diff --git a/lingua_franca/lang/common_data_gl-es.py b/lingua_franca/lang/common_data_gl-es.py new file mode 100644 index 00000000..6e299a5a --- /dev/null +++ b/lingua_franca/lang/common_data_gl-es.py @@ -0,0 +1,306 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# NOTE: This file has no use yet. It needs to be called from other functions + +from collections import OrderedDict + + +_ARTICLES_GL-ES = {'o', 'a', 'os', 'as'} + +_NUM_STRING_GL-ES = { + 0: 'cero', + 1: 'un', + 2: 'dous', + 3: 'tres', + 4: 'catro', + 5: 'cinco', + 6: 'seis', + 7: 'sete', + 8: 'oito', + 9: 'nove', + 10: 'dez', + 11: 'once', + 12: 'doce', + 13: 'trece', + 14: 'catorce', + 15: 'quince', + 16: 'dezaseis', + 17: 'dezasete', + 18: 'dezaoito', + 19: 'dezanove', + 20: 'vinte', + 30: 'trinta', + 40: 'corenta', + 50: 'cincuenta', + 60: 'sesenta', + 70: 'setenta', + 80: 'oitenta', + 90: 'noventa' +} + +_STRING_NUM_GL-ES = { + "cero": 0, + "un": 1, + "unha": 1, + "dous": 2, + "tres": 3, + "catro": 4, + "cinco": 5, + "seis": 6, + "sete": 7, + "oito": 8, + "nove": 9, + "dez": 10, + "once": 11, + "doce": 12, + "trece": 13, + "catorce": 14, + "quince": 15, + "dezaseis": 16, + "dezasete": 17, + "dezaoito": 18, + "dezanove": 19, + "vinte": 20, + "vinte e un": 21, + "vinte e dous": 22, + "vinte e tres": 23, + "vinte e catro": 24, + "vinte e cinco": 25, + "vinte e seis": 26, + "vinte e sete": 27, + "vinte e oito": 28, + "vinte e nove": 29, + "trinta": 30, + 
"corenta": 40, + "cincuenta": 50, + "sesenta": 60, + "setenta": 70, + "oitenta": 80, + "noventa": 90, + "cen": 100, + "cento": 100, + "douscentos": 200, + "duascentas": 200, + "trescentos": 300, + "trescentas": 300, + "catrocentos": 400, + "catrocentas": 400, + "cincocentos": 500, + "cincocentas": 500, + "seiscentos": 600, + "seiscentas": 600, + "setecentos": 700, + "setecentas": 700, + "oitocentos": 800, + "oitocentas": 800, + "novecentos": 900, + "novecentas": 900, + "mil": 1000} + + +_FRACTION_STRING_GL-ES = { + 2: 'medio', + 3: 'terzo', + 4: 'cuarto', + 5: 'quinto', + 6: 'sexto', + 7: 'séptimo', + 8: 'oitavo', + 9: 'noveno', + 10: 'décimo', + 11: 'onceavo', + 12: 'doceavo', + 13: 'treceavo', + 14: 'catorceavo', + 15: 'quinceavo', + 16: 'dezaseisavo', + 17: 'dezaseteavo', + 18: 'dezaoitoavo', + 19: 'dezanoveavo', + 20: 'vinteavo' +} + +# https://www.grobauer.at/es_eur/zahlnamen.php +_LONG_SCALE_GL-ES = OrderedDict([ + (100, 'centena'), + (1000, 'millar'), + (1000000, 'millón'), + (1e9, "millardo"), + (1e12, "billón"), + (1e18, 'trillón'), + (1e24, "cuatrillón"), + (1e30, "quintillón"), + (1e36, "sextillón"), + (1e42, "septillón"), + (1e48, "octillón"), + (1e54, "nonillón"), + (1e60, "decillón"), + (1e66, "undecillón"), + (1e72, "duodecillón"), + (1e78, "tredecillón"), + (1e84, "cuatrodecillón"), + (1e90, "quindecillón"), + (1e96, "sexdecillón"), + (1e102, "septendecillón"), + (1e108, "octodecillón"), + (1e114, "novendecillón"), + (1e120, "vigintillón"), + (1e306, "unquinquagintillón"), + (10 ** 312, "duoquinquagintillón"), # int: a float literal above ~1.8e308 evaluates to inf + (10 ** 336, "sexquinquagintillón"), + (10 ** 366, "unsexagintillón") +]) + + +_SHORT_SCALE_GL-ES = OrderedDict([ + (100, 'centena'), + (1000, 'millar'), + (1000000, 'millón'), + (1e9, "billón"), + (1e12, 'trillón'), + (1e15, "cuatrillón"), + (1e18, "quintillón"), + (1e21, "sextillón"), + (1e24, "septillón"), + (1e27, "octillón"), + (1e30, "nonillón"), + (1e33, "decillón"), + (1e36, "undecillón"), + (1e39, "duodecillón"), + (1e42, "tredecillón"), + 
(1e45, "cuatrodecillón"), + (1e48, "quindecillón"), + (1e51, "sexdecillón"), + (1e54, "septendecillón"), + (1e57, "octodecillón"), + (1e60, "novendecillón"), + (1e63, "vigintillón"), + (1e66, "unvigintillón"), + (1e69, "duovigintillón"), + (1e72, "tresvigintillón"), + (1e75, "quattuorvigintillón"), + (1e78, "quinquavigintillón"), + (1e81, "sesvigintillón"), + (1e84, "septemvigintillón"), + (1e87, "octovigintillón"), + (1e90, "novemvigintillón"), + (1e93, "trigintillón"), + (1e96, "untrigintillón"), + (1e99, "duotrigintillón"), + (1e102, "trestrigintillón"), + (1e105, "quattuortrigintillón"), + (1e108, "quinquatrigintillón"), + (1e111, "sestrigintillón"), + (1e114, "septentrigintillón"), + (1e117, "octotrigintillón"), + (1e120, "noventrigintillón"), + (1e123, "quadragintillón"), + (1e153, "quinquagintillón"), + (1e183, "sexagintillón"), + (1e213, "septuagintillón"), + (1e243, "octogintillón"), + (1e273, "nonagintillón"), + (1e303, "centillón"), + (1e306, "uncentillón"), + (10 ** 309, "duocentillón"), # int from here on: a float literal above ~1.8e308 evaluates to inf + (10 ** 312, "trescentillón"), + (10 ** 333, "decicentillón"), + (10 ** 336, "undecicentillón"), + (10 ** 363, "viginticentillón"), + (10 ** 366, "unviginticentillón"), + (10 ** 393, "trigintacentillón"), + (10 ** 423, "quadragintacentillón"), + (10 ** 453, "quinquagintacentillón"), + (10 ** 483, "sexagintacentillón"), + (10 ** 513, "septuagintacentillón"), + (10 ** 543, "octogintacentillón"), + (10 ** 573, "nonagintacentillón"), + (10 ** 603, "ducentillón"), + (10 ** 903, "trecentillón"), + (10 ** 1203, "quadringentillón"), + (10 ** 1503, "quingentillón"), + (10 ** 1803, "sexcentillón"), + (10 ** 2103, "septingentillón"), + (10 ** 2403, "octingentillón"), + (10 ** 2703, "nongentillón"), + (10 ** 3003, "millinillón") +]) + +# TODO: female forms. 
+_ORDINAL_STRING_BASE_GL-ES = { + 1: 'primeiro', + 2: 'segundo', + 3: 'terceiro', + 4: 'cuarto', + 5: 'quinto', + 6: 'sexto', + 7: 'séptimo', + 8: 'oitavo', + 9: 'noveno', + 10: 'décimo', + 11: 'undécimo', + 12: 'duodécimo', + 13: 'decimoterceiro', + 14: 'decimocuarto', + 15: 'decimoquinto', + 16: 'decimosexto', + 17: 'decimoséptimo', + 18: 'decimoitavo', + 19: 'decimonoveno', + 20: 'vixésimo', + 30: 'trixésimo', + 40: "cuadraxésimo", + 50: "quincuaxésimo", + 60: "sexaxésimo", + 70: "septuaxésimo", + 80: "octoxésimo", + 90: "nonaxésimo", + 1e2: "centésimo", + 1e3: "milésimo" +} + + +_SHORT_ORDINAL_STRING_GL-ES = { + 1e6: "millonésimo", + 1e9: "milmillonésimo", + 1e12: "billonésimo", + 1e15: "milbillonésimo", + 1e18: "trillonésimo", + 1e21: "miltrillonésimo", + 1e24: "cuatrillonésimo", + 1e27: "milcuatrillonésimo", + 1e30: "quintillonésimo", + 1e33: "milquintillonésimo" + # TODO > 1e33 +} +_SHORT_ORDINAL_STRING_GL-ES.update(_ORDINAL_STRING_BASE_GL-ES) + + +_LONG_ORDINAL_STRING_GL-ES = { + 1e6: "millonésimo", + 1e12: "billonésimo", + 1e18: "trillonésimo", + 1e24: "cuatrillonésimo", + 1e30: "quintillonésimo", + 1e36: "sextillonésimo", + 1e42: "septillonésimo", + 1e48: "octillonésimo", + 1e54: "nonillonésimo", + 1e60: "decillonésimo" + # TODO > 1e60 +} +_LONG_ORDINAL_STRING_GL-ES.update(_ORDINAL_STRING_BASE_GL-ES) diff --git a/lingua_franca/lang/format_gl-es.py b/lingua_franca/lang/format_gl-es.py new file mode 100644 index 00000000..f5631bd0 --- /dev/null +++ b/lingua_franca/lang/format_gl-es.py @@ -0,0 +1,242 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Format functions for galician (gl-es) +""" +from lingua_franca.lang.format_common import convert_to_mixed_fraction +from lingua_franca.lang.common_data_gl-es import _NUM_STRING_GL-ES, \ + _FRACTION_STRING_GL-ES + + +def nice_number_es(number, speech=True, denominators=range(1, 21)): + """ Galician helper for nice_number + This function formats a float to human understandable fractions. Like + 4.5 becomes "4 e medio" for speech and "4 1/2" for text + Args: + number (int or float): the float to format + speech (bool): format for speech (True) or display (False) + denominators (iter of ints): denominators to use, default [1 .. 20] + Returns: + (str): The formatted string. 
+ """ + strNumber = "" + whole = 0 + num = 0 + den = 0 + + result = convert_to_mixed_fraction(number, denominators) + + if not result: + # Give up, just represent as a 3 decimal number + whole = round(number, 3) + else: + whole, num, den = result + + if not speech: + if num == 0: + strNumber = '{:,}'.format(whole) + strNumber = strNumber.replace(",", " ") + strNumber = strNumber.replace(".", ",") + return strNumber + else: + return '{} {}/{}'.format(whole, num, den) + else: + if num == 0: + # if the number is not a fraction, nothing to do + strNumber = str(whole) + strNumber = strNumber.replace(".", ",") + return strNumber + den_str = _FRACTION_STRING_GL-ES[den] + # if it is not an integer + if whole == 0: + # if there is no whole number + if num == 1: + # if numerator is 1, return "un medio", for example + strNumber = 'un {}'.format(den_str) + else: + # else return "catro terzos", for example + strNumber = '{} {}'.format(num, den_str) + elif num == 1: + # if there is a whole number and numerator is 1 + if den == 2: + # if denominator is 2, return "1 e medio", for example + strNumber = '{} e {}'.format(whole, den_str) + else: + # else return "1 e 1 terzo", for example + strNumber = '{} e 1 {}'.format(whole, den_str) + else: + # else return "2 e 3 cuarto", for example + strNumber = '{} e {} {}'.format(whole, num, den_str) + if num > 1 and den != 3: + # if the numerator is greater than 1 and the denominator + # is not 3 ("terzo"), add an s for plural + strNumber += 's' + + return strNumber + + +def pronounce_number_gl-es(number, places=2): + """ + Convert a number to its spoken equivalent + For example, '5.2' would return 'cinco coma dous' + Args: + number (float or int): the number to pronounce (under 100) + places(int): maximum decimal places to speak + Returns: + (str): The pronounced number + """ + if abs(number) >= 100: + # TODO: support numbers above 100 + return str(number) + + result = "" + if number < 0: + result = "menos " + number = abs(number) + + 
if number >= 20: # from 20 onwards: tens word, then "e" and the units word + tens = int(number-int(number) % 10) + ones = int(number - tens) + result += _NUM_STRING_GL-ES[tens] + if ones > 0: + result += " e " + _NUM_STRING_GL-ES[ones] + else: + result += _NUM_STRING_GL-ES[int(number)] + + # Deal with decimal part, in galician is commonly used the comma + # instead dot. Decimal part can be written both with comma + # and dot, but when pronounced, its pronounced "coma" + if not number == int(number) and places > 0: + if abs(number) < 1.0 and (result == "menos " or not result): + result += "cero" + result += " coma" + _num_str = str(number) + _num_str = _num_str.split(".")[1][0:places] + for char in _num_str: + result += " " + _NUM_STRING_GL-ES[int(char)] + return result + + +def nice_time_es(dt, speech=True, use_24hour=False, use_ampm=False): + """ + Format a time to a comfortable human format + For example, generate 'cinco trinta' for speech or '5:30' for + text display. + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False) + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. "3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + speak = "" + if use_24hour: + # Note that when the time is spoken in 24-hour format + # there is no need to add any further qualifier + # such as "pola noite", "pola tarde" or "pola mañá". 
+ if dt.hour == 1: + speak += "a unha" + else: + speak += "as " + pronounce_number_gl-es(dt.hour) + + # 14:04 is spoken "as catorce cero catro" + if dt.minute < 10: + speak += " cero " + pronounce_number_gl-es(dt.minute) + else: + speak += " " + pronounce_number_gl-es(dt.minute) + + else: + # Prepare for "tres menos cuarto" ?? + if dt.minute == 35: + minute = -25 + hour = dt.hour + 1 + elif dt.minute == 40: + minute = -20 + hour = dt.hour + 1 + elif dt.minute == 45: + minute = -15 + hour = dt.hour + 1 + elif dt.minute == 50: + minute = -10 + hour = dt.hour + 1 + elif dt.minute == 55: + minute = -5 + hour = dt.hour + 1 + else: + minute = dt.minute + hour = dt.hour + + if hour == 0 or hour == 12: + speak += "as doce" + elif hour == 1 or hour == 13: + speak += "a unha" + elif hour < 13: + speak = "as " + pronounce_number_gl-es(hour) + else: + speak = "as " + pronounce_number_gl-es(hour-12) + + if minute != 0: + # special cases + if minute == 15: + speak += " e cuarto" + elif minute == 30: + speak += " e media" + elif minute == -15: + speak += " menos cuarto" + else: # e.g. "seis e nove", "sete e vinte e cinco" + if minute > 0: + speak += " e " + pronounce_number_gl-es(minute) + else: # e.g. "sete menos vinte": no "e" is inserted + speak += " " + pronounce_number_gl-es(minute) + + # when no part of the day is going to be spoken + if minute == 0 and not use_ampm: + # 3:00 + speak += " en punto" + + if use_ampm: + # "pola noite" runs from nightfall until midnight, + # so saying that it starts at 21h is somewhat subjective; + # in Spain 20h is still called "pola tarde"; + # in Galician 12h is "pola mañá" or midday, + # so "pola tarde" is used from 13h onwards. 
+ if hour >= 0 and hour < 6: + speak += " pola madrugada" + elif hour >= 6 and hour < 13: + speak += " pola mañá" + elif hour >= 13 and hour < 21: + speak += " pola tarde" + else: + speak += " pola noite" + return speak diff --git a/lingua_franca/lang/parse_gl-es.py b/lingua_franca/lang/parse_gl-es.py new file mode 100644 index 00000000..040f1b88 --- /dev/null +++ b/lingua_franca/lang/parse_gl-es.py @@ -0,0 +1,1100 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from datetime import datetime +from dateutil.relativedelta import relativedelta + +from lingua_franca.time import now_local +from lingua_franca.lang.format_gl-es import pronounce_number_gl-es +from lingua_franca.lang.parse_common import * +from lingua_franca.lang.common_data_gl-es import _ARTICLES_GL-ES, _STRING_NUM_GL-ES + + +def is_fractional_gl-es(input_str, short_scale=True): + """ + This function takes the given text and checks if it is a fraction. + Args: + input_str (str): the string to check if fractional + short_scale (bool): use short scale if True, long scale if False + Returns: + (bool) or (float): False if not a fraction, otherwise the fraction + """ + if input_str.endswith('s', -1): + input_str = input_str[:len(input_str) - 1] # e.g. 
"quintos" + + aFrac = {"medio": 2, "media": 2, "terzo": 3, "cuarto": 4, + "cuarta": 4, "quinto": 5, "quinta": 5, "sexto": 6, "sexta": 6, + "séptimo": 7, "séptima": 7, "oitavo": 8, "oitava": 8, + "noveno": 9, "novena": 9, "décimo": 10, "décima": 10, + "onceavo": 11, "onceava": 11, "doceavo": 12, "doceava": 12} + + if input_str.lower() in aFrac: + return 1.0 / aFrac[input_str.lower()] + if (input_str == "vixésimo" or input_str == "vixésima"): + return 1.0 / 20 + if (input_str == "trixésimo" or input_str == "trixésima"): + return 1.0 / 30 + if (input_str == "centésimo" or input_str == "centésima"): + return 1.0 / 100 + if (input_str == "milésimo" or input_str == "milésima"): + return 1.0 / 1000 + return False + + +def extract_number_gl-es(text, short_scale=True, ordinals=False): + """ + This function extracts a number written as digits, number words + or fractions from the given text. + Args: + text (str): the string to extract a number from + Returns: + (int) or (float): The value of extracted number + """ + # TODO: short_scale and ordinals don't do anything here. + # The parameters are present in the function signature for API compatibility + # reasons. + # + # Returns incorrect output on certain fractional phrases like, "cuarto de dous" + # TODO: numbers greater than 999999 + aWords = text.lower().split() + count = 0 + result = None + while count < len(aWords): + val = 0 + word = aWords[count] + next_next_word = None + if count + 1 < len(aWords): + next_word = aWords[count + 1] + if count + 2 < len(aWords): + next_next_word = aWords[count + 2] + else: + next_word = None + + # is current word a number? 
+ if word in _STRING_NUM_GL-ES: + val = _STRING_NUM_GL-ES[word] + elif word.isdigit(): # doesn't work with decimals + val = int(word) + elif is_numeric(word): + val = float(word) + elif is_fractional_gl-es(word): + if not result: + result = 1 + result = result * is_fractional_gl-es(word) + count += 1 + continue + + if not val: + # look for fractions like "2/3" + aPieces = word.split('/') + # if (len(aPieces) == 2 and is_numeric(aPieces[0]) + # and is_numeric(aPieces[1])): + if look_for_fractions(aPieces): + val = float(aPieces[0]) / float(aPieces[1]) + + if val: + if result is None: + result = 0 + # handle fractions + if next_word != "avos": + result = val + else: + result = float(result) / float(val) + + if next_word is None: + break + + # number word and fraction + ands = ["e"] + if next_word in ands: + zeros = 0 + if result is None: + count += 1 + continue + newWords = aWords[count + 2:] + newText = "" + for word in newWords: + newText += word + " " + + afterAndVal = extract_number_gl-es(newText[:-1]) + if afterAndVal: + if result < afterAndVal or result < 20: + while afterAndVal > 1: + afterAndVal = afterAndVal / 10.0 + for word in newWords: + if word == "cero" or word == "0": + zeros += 1 + else: + break + for _ in range(0, zeros): + afterAndVal = afterAndVal / 10.0 + result += afterAndVal + break + elif next_next_word is not None: + if next_next_word in ands: + newWords = aWords[count + 3:] + newText = "" + for word in newWords: + newText += word + " " + afterAndVal = extract_number_gl-es(newText[:-1]) + if afterAndVal: + if result is None: + result = 0 + result += afterAndVal + break + + decimals = ["punto", "coma", ".", ","] + if next_word in decimals: + zeros = 0 + newWords = aWords[count + 2:] + newText = "" + for word in newWords: + newText += word + " " + for word in newWords: + if word == "cero" or word == "0": + zeros += 1 + else: + break + afterDotVal = str(extract_number_gl-es(newText[:-1])) + afterDotVal = zeros * "0" + afterDotVal + result = 
float(str(result) + "." + afterDotVal) + break + count += 1 + + # Return the $str with the number related words removed + # (now empty strings, so strlen == 0) + # aWords = [word for word in aWords if len(word) > 0] + # text = ' '.join(aWords) + if "." in str(result): + integer, dec = str(result).split(".") + # cast float to int + if dec == "0": + result = int(integer) + + return result or False + + +def _gl-es_number_parse(words, i): + # Parse the number words starting at words[i]; returns (value, next_i) or None + + def gl-es_cte(i, s): + if i < len(words) and s == words[i]: + return s, i + 1 + return None + + def gl-es_number_word(i, mi, ma): + if i < len(words): + v = _STRING_NUM_GL-ES.get(words[i]) + if v is not None and mi <= v <= ma: # explicit None test so that "cero" (0) can match + return v, i + 1 + return None + + def gl-es_number_1_99(i): + r1 = gl-es_number_word(i, 1, 19) + if r1: + return r1 + + r1 = gl-es_number_word(i, 20, 90) + if r1: + v1, i1 = r1 + r2 = gl-es_cte(i1, "e") + if r2: + i2 = r2[1] + r3 = gl-es_number_word(i2, 1, 9) + if r3: + v3, i3 = r3 + return v1 + v3, i3 + return r1 + return None + + def gl-es_number_1_999(i): + # [2-9]centos [1-99]? + r1 = gl-es_number_word(i, 100, 900) + if r1: + v1, i1 = r1 + r2 = gl-es_number_1_99(i1) + if r2: + v2, i2 = r2 + return v1 + v2, i2 + else: + return r1 + + # [1-99] + r1 = gl-es_number_1_99(i) + if r1: + return r1 + + return None + + def gl-es_number(i): + # check for cero + r1 = gl-es_number_word(i, 0, 0) + if r1: + return r1 + + # check for [1-999] (mil [0-999])? + r1 = gl-es_number_1_999(i) + if r1: + v1, i1 = r1 + r2 = gl-es_cte(i1, "mil") + if r2: + i2 = r2[1] + r3 = gl-es_number_1_999(i2) + if r3: + v3, i3 = r3 + return v1 * 1000 + v3, i3 + else: + return v1 * 1000, i2 + else: + return r1 + return None + + return gl-es_number(i) + + +def extract_numbers_gl-es(text, short_scale=True, ordinals=False): + """ + Takes in a string and extracts a list of numbers. + Args: + text (str): the string to extract a number from + short_scale (bool): Use "short scale" or "long scale" for large + numbers -- over a million. 
The default is short scale, which + is now common in most English speaking countries. + See https://en.wikipedia.org/wiki/Names_of_large_numbers + ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Returns: + list: list of extracted numbers as floats + """ + return extract_numbers_generic(text, pronounce_number_gl-es, + extract_number_gl-es, short_scale=short_scale, + ordinals=ordinals) + + +def normalize_gl-es(text, remove_articles=True): + """ Galician string normalization """ + # TODO return GalicianNormalizer().normalize(text, remove_articles) + words = text.split() # this also removed extra spaces + + normalized = "" + i = 0 + while i < len(words): + word = words[i] + + if remove_articles and word in _ARTICLES_GL-ES: + i += 1 + continue + + # Convert numbers into digits + r = _gl-es_number_parse(words, i) + if r: + v, i = r + normalized += " " + str(v) + continue + + normalized += " " + word + i += 1 + + return normalized[1:] # strip the initial space + + +# TODO MycroftAI/mycroft-core#2348 +def extract_datetime_gl-es(text, anchorDate=None, default_time=None): + def clean_string(s): + # cleans the input string of unneeded punctuation and capitalization + # among other things + symbols = [".", ",", ";", "?", "!", "º", "ª"] + noise_words = ["entre", "a", "do", "ao", "o", "de", + "para", "unha", "calquera", "a", + "e'", "esta", "este"] + + for word in symbols: + s = s.replace(word, "") + for word in noise_words: + s = s.replace(" " + word + " ", " ") + s = s.lower().replace( + "á", + "a").replace( + "é", + "e").replace( + "ó", + "o").replace( + "-", + " ").replace( + "_", + "") + # handle synonyms and equivalents, "tomorrow early = tomorrow morning + synonyms = {"mañá": ["amañecer", "cedo", "moi cedo"], + "tarde": ["media tarde", "atardecer"], + "noite": ["anoitecer", "tarde"]} + for syn in synonyms: + for word in synonyms[syn]: + s = s.replace(" " + word + " ", " " + syn + " ") + # relevant plurals, cant just extract all s in pt + wordlist 
= ["mañás", "tardes", "noites", "días", "semanas", + "anos", "minutos", "segundos", "as", "os", "seguintes", + "próximas", "próximos", "horas"] + for _, word in enumerate(wordlist): + s = s.replace(word, word.rstrip('s')) + s = s.replace("meses", "mes").replace("anteriores", "anterior") + return s + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if text == "": + return None + if anchorDate is None: + anchorDate = now_local() + + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + dateNow = anchorDate + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + words = clean_string(text).split(" ") + timeQualifiersList = ['mañá', 'tarde', 'noite'] + time_indicators = ["en", "a", "ao", "por", "pasados", + "pasadas", "día", "hora"] + days = ['luns', 'martes', 'mércores', + 'xoves', 'vernes', 'sábado', 'domingo'] + months = ['xaneiro', 'febreiro', 'marzo', 'abril', 'maio', 'xuño', + 'xullo', 'agosto', 'setembro', 'outubro', 'novembro', + 'decembro'] + monthsShort = ['xan', 'feb', 'mar', 'abr', 'mai', 'xuñ', 'xul', 'ago', + 'set', 'out', 'nov', 'dec'] + nexts = ["seguinte", "próximo", "próxima"] + suffix_nexts = ["seguintes", "subsecuentes"] + lasts = ["último", "última"] + suffix_lasts = ["pasada", "pasado", "anterior", "antes"] + nxts = ["despois", "seguinte", "próximo", "próxima"] + prevs = ["antes", "previa", "previo", "anterior"] + froms = ["desde", "en", "para", "despois de", "por", "próximo", + "próxima", "de"] + thises = ["este", "esta"] + froms += thises + lists = nxts + prevs + froms + time_indicators + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = 
words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + + start = idx + used = 0 + # save timequalifier for later + if word in timeQualifiersList: + timeQualifier = word + + # parse today, tomorrow, yesterday + elif word == "hoxe" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "mañá" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "onte" and not fromFlag: + dayOffset -= 1 + used += 1 + # "before yesterday" and "before before yesterday" + elif (word == "antonte" or + (word == "antes" and wordNext == "de onte")) and not fromFlag: + dayOffset -= 2 + used += 1 + if wordNext == "onte": + used += 1 + elif word == "antes" and wordNext == "antes" and wordNextNext == \ + "de onte" and not fromFlag: + dayOffset -= 3 + used += 3 + elif word == "antes de antonte" and not fromFlag: + dayOffset -= 3 + used += 1 + # day after tomorrow + elif word == "pasado" and wordNext == "mañá" and not fromFlag: + dayOffset += 2 + used = 2 + # day before yesterday + elif word == "antes" and wordNext == "de antonte" and not fromFlag: + dayOffset -= 2 + used = 2 + # parse 5 days, 10 weeks, last week, next week, week after + elif word == "día": + if wordNext == "pasado" or wordNext == "ante": + used += 1 + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used += 1 + elif (wordPrev and wordPrev[0].isdigit() and + wordNext not in months and + wordNext not in monthsShort): + dayOffset += int(wordPrev) + start -= 1 + used += 2 + elif wordNext and wordNext[0].isdigit() and wordNextNext not in \ + months and wordNextNext not in monthsShort: + dayOffset += int(wordNext) + start -= 1 + used += 2 + + elif word == "semana" and not fromFlag: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if 
wordPrev == w: + dayOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "mes" and not fromFlag: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + monthOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + monthOffset = -7 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "ano" and not fromFlag: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + yearOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + yearOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + yearOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + yearOffset = -7 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. 
+ elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "seguinte": + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "pasado": + dayOffset -= 7 + used += 1 + start -= 1 + if wordNext == "seguinte": + # dayOffset += 7 + used += 1 + elif wordNext == "pasado": + # dayOffset -= 7 + used += 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months[m] + if wordPrev and wordPrev[0].isdigit(): + # 13 maio + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + # maio 13 + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordPrevPrev and wordPrevPrev[0].isdigit(): + # 13 dia maio + datestr += " " + wordPrevPrev + + start -= 2 + used += 2 + if wordNext and word[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNextNext and wordNextNext[0].isdigit(): + # maio dia 13 + datestr += " " + wordNextNext + used += 2 + if wordNextNextNext and wordNextNextNext[0].isdigit(): + datestr += " " + wordNextNextNext + used += 1 + hasYear = True + else: + hasYear = False + + if datestr in months: + datestr = "" + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + monthsShort + validFollowups.append("hoxe") + validFollowups.append("mañá") + validFollowups.append("onte") + validFollowups.append("antonte") + validFollowups.append("agora") + validFollowups.append("xa") + validFollowups.append("antes") + 
+ # TODO debug word "despois" that one is failing for some reason + if word in froms and wordNext in validFollowups: + + if not (wordNext == "mañá" and wordNext == "onte") and not ( + word == "pasado" or word == "antes"): + used = 2 + fromFlag = True + if wordNext == "mañá" and word != "pasado": + dayOffset += 1 + elif wordNext == "onte": + dayOffset -= 1 + elif wordNext == "antonte": + dayOffset -= 2 + elif wordNext == "antes" and wordNextNext == "de onte": + dayOffset -= 2 + elif (wordNext == "antes" and wordNext == "antes" and + wordNextNextNext == "de onte"): + dayOffset -= 3 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + # if wordNextNext == "feira": + # used += 1 + if tmpOffset < 0: + tmpOffset += 7 + if wordNextNext: + if wordNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNextNextNext: + if wordNextNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + # if wordNextNextNext == "feira": + # used += 1 + if wordNext in months: + used -= 1 + if used > 0: + if start - 1 > 0 and words[start - 1] in lists: + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in lists: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < 
len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "medio" and wordNext == "día": + hrAbs = 12 + used += 2 + elif word == "media" and wordNext == "noite": + hrAbs = 0 + used += 2 + elif word == "mañá": + if not hrAbs: + hrAbs = 8 + used += 1 + elif word == "tarde": + if not hrAbs: + hrAbs = 15 + used += 1 + elif word == "media" and wordNext == "tarde": + if not hrAbs: + hrAbs = 17 + used += 2 + elif word == "tarde" and wordNext == "noite": + if not hrAbs: + hrAbs = 20 + used += 2 + elif word == "media" and wordNext == "mañá": + if not hrAbs: + hrAbs = 10 + used += 2 + # elif word == "a" and wordNext == "tardecer": + # if not hrAbs: + # hrAbs = 19 + # used += 2 + # elif word == "a" and wordNext == "mañecer": + # if not hrAbs: + # hrAbs = 11 + # used += 2 + elif word == "madrugada": + if not hrAbs: + hrAbs = 1 + used += 2 + elif word == "noite": + if not hrAbs: + hrAbs = 21 + used += 1 + # parse half an hour, quarter hour + elif (word == "hora" and + (wordPrev in time_indicators or wordPrevPrev in + time_indicators)): + if wordPrev == "media": + minOffset = 30 + elif wordPrev == "cuarto": + minOffset = 15 + elif wordPrevPrev == "cuarto": + minOffset = 15 + if idx > 2 and words[idx - 3] in time_indicators: + words[idx - 3] = "" + words[idx - 2] = "" + else: + hrOffset = 1 + if wordPrevPrev in time_indicators: + words[idx - 2] = "" + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == 
"": + nextWord = wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + elif wordNext == "mañá" or wordNext == "madrugada": + remainder = "am" + used += 1 + elif wordNext == "tarde": + remainder = "pm" + used += 1 + elif wordNext == "noite": + if 0 < int(word[0]) < 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + elif wordNext in thises and wordNextNext == "mañá": + remainder = "am" + used = 2 + elif wordNext in thises and wordNextNext == "tarde": + remainder = "pm" + used = 2 + elif wordNext in thises and wordNextNext == "noite": + remainder = "pm" + used = 2 + else: + if timeQualifier != "": + if strHH <= 12 and \ + (timeQualifier == "mañá" or + timeQualifier == "tarde"): + strHH += 12 + + else: + # try to parse # s without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + else: + if (wordNext == "pm" or + wordNext == "p.m." or + wordNext == "tarde"): + strHH = strNum + remainder = "pm" + used = 1 + elif (wordNext == "am" or + wordNext == "a.m." 
or + wordNext == "mañá"): + strHH = strNum + remainder = "am" + used = 1 + elif (int(word) > 100 and + ( + # wordPrev == "o" or + # wordPrev == "oh" or + wordPrev == "cero" + )): + # 0800 hours (pronounced oh-eight-hundred) + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + if wordNext == "hora": + used += 1 + elif ( + wordNext == "hora" and + word[0] != '0' and + ( + int(word) < 100 and + int(word) > 2400 + )): + # ignores military time + # "in 3 hours" + hrOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + + elif wordNext == "minuto": + # "in 10 minutes" + minOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "segundo": + # in 5 seconds + secOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(word) > 100: + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + if wordNext == "hora": + used += 1 + + elif wordNext == "" or ( + wordNext == "en" and wordNextNext == "punto"): + strHH = word + strMM = 00 + if wordNext == "en" and wordNextNext == "punto": + used += 2 + if wordNextNextNext == "tarde": + remainder = "pm" + used += 1 + elif wordNextNextNext == "mañá": + remainder = "am" + used += 1 + elif wordNextNextNext == "noite": + if 0 > strHH > 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + + elif wordNext[0].isdigit(): + strHH = word + strMM = wordNext + used += 1 + if wordNextNext == "hora": + used += 1 + else: + isTime = False + + strHH = int(strHH) if strHH else 0 + strMM = int(strMM) if strMM else 0 + strHH = strHH + 12 if (remainder == "pm" and + 0 < strHH < 12) else strHH + strHH = strHH - 12 if (remainder == "am" and + 0 < strHH >= 12) else strHH + if strHH > 24 or strMM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = strHH * 1 + minAbs = strMM * 1 + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + words[idx + i] = "" + + if wordPrev == "en" or wordPrev == "punto": + 
words[words.index(wordPrev)] = "" + + if idx > 0 and wordPrev in time_indicators: + words[idx - 1] = "" + if idx > 1 and wordPrevPrev in time_indicators: + words[idx - 2] = "" + + idx += used - 1 + found = True + + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = dateNow + extractedDate = extractedDate.replace(microsecond=0, + second=0, + minute=0, + hour=0) + if datestr != "": + en_months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', + 'aug', + 'sept', 'oct', 'nov', 'dec'] + for idx, en_month in enumerate(en_months): + datestr = datestr.replace(months[idx], en_month) + for idx, en_month in enumerate(en_monthsShort): + datestr = datestr.replace(monthsShort[idx], en_month) + + temp = datetime.strptime(datestr, "%B %d") + if extractedDate.tzinfo: + temp = temp.replace(tzinfo=extractedDate.tzinfo) + + if not hasYear: + temp = temp.replace(year=extractedDate.year) + + if extractedDate < temp: + extractedDate = extractedDate.replace( + year=int(currentYear), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + + if hrAbs is None and minAbs is None and default_time: + hrAbs = default_time.hour + minAbs = default_time.minute + + if hrAbs != -1 and 
def get_gender_gl_es(word, context=""):
    """Guess the grammatical gender of a word.

    Some languages assign genders to specific words. This function attempts
    to determine the gender, optionally using the provided context sentence.

    The rules are imprecise and incomplete, but a good starting point:
    a trailing plural "s" is dropped; if the word then appears in ``context``
    preceded by another word, that preceding word (typically an article) is
    classified first; otherwise the final letter of the word decides.

    Args:
        word (str): The word to look up.
        context (str, optional): String containing word, for context.

    Returns:
        str: The code "m" (male) or "f" (female), or None when the gender
        cannot be guessed (including for an empty word).
    """
    # NOTE(review): the original identifier contained a hyphen ("gl-es"),
    # which Python cannot parse; confirm which suffix the language loader
    # expects before merging.
    stem = word.rstrip("s")
    if not stem:
        # Nothing left to inspect (empty input or only "s" characters).
        return None

    gender = None
    tokens = context.split(" ")
    for idx, token in enumerate(tokens):
        # Compare de-pluralised forms so "pontes" is found in "as pontes".
        if idx != 0 and token.rstrip("s") == stem:
            # The preceding word is judged on its own, without context,
            # so the recursion is at most one level deep.
            gender = get_gender_gl_es(tokens[idx - 1])
            break

    if not gender:
        # Fall back to the word's own ending.
        if stem[-1] == "a":
            gender = "f"
        elif stem[-1] in ("o", "e"):
            gender = "m"
    return gender


class GalicianNormalizer(Normalizer):
    """Text normalizer for Galician.

    TODO: implement language-specific normalization. Nothing is overridden
    here, so all behaviour comes from ``Normalizer``.
    """
"year_format": { + "1": {"match": "^\\d\\d?$", "format": "{formatted_decade} {bc}"}, + "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, + "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, + "5": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "default": "{year} {bc}", + "bc": "a.C." + }, + "date_format": { + "date_full": "{weekday}, {day} de {month} de {formatted_year}", + "date_full_no_year": "{weekday}, {day} de {month}", + "date_full_no_year_month": "{weekday}, día {day}", + "today": "hoxe", + "tomorrow": "mañá", + "yesterday": "onte" + }, + "date_time_format": { + "date_time": "{formatted_date} a {formatted_time}" + }, + "weekday": { + "0": "luns", + "1": "martes", + "2": "mércores", + "3": "xoves", + "4": "venres", + "5": "sábado", + "6": "domingo", + + }, + "date": { + "1": "un/unha", + "2": "dous", + "3": "tres", + "4": "catro", + "5": "cinco", + "6": "seis", + "7": "sete", + "8": "oito", + "9": "nove", + "10": "dez", + "11": "once", + "12": "doce", + "13": "trece", + "14": "catorce", + "15": "quince", + "16": "dezaseis", + "17": "dezasete", + "18": "dezaoito", + "19": "dezanove", + "20": "vinte", + "21": "vinte-e-un/unha", + "22": "vinte-e-dous", + "23": "vinte-e-tres", + "24": "vinte-e-catro", + "25": "vinte-e-cinco", + "26": "vinte-e-seis", + "27": "vinte-e-sete", + "28": "vinte-e-oito", + "29": "vinte-e-nove", + "30": "trinta", + "31": "trinta-e-un/unha" + }, + "month": { + "1": "xaneiro", + "2": "febreiro", + "3": "marzo", + "4": "abril", + "5": "maio", + "6": "xuño", + "7": "xullo", + "8": "agosto", + "9": "setembro", + "10": "outubro", + "11": "novembro", + "12": "decembro" + }, + "number": { + "0": "cero", + "1": "un/unha", + "2": "dous", + "3": "tres", + "4": "catro", + "5": "cinco", + "6": "seis", + "7": "sete", + "8": "oito", + "9": "nove", + "10": "dez", + "11": "once", + "12": "doce", + 
"13": "trece", + "14": "catorce", + "15": "quince", + "16": "dezaseis", + "17": "dezasete", + "18": "dezaoito", + "19": "dezanove", + "20": "vinte", + "30": "trinta", + "40": "corenta", + "50": "cincuenta", + "60": "sesenta", + "70": "setenta", + "80": "oitenta", + "90": "noventa" + } +} diff --git a/lingua_franca/res/text/gl-es/date_time_test.json b/lingua_franca/res/text/gl-es/date_time_test.json new file mode 100644 index 00000000..4d9f09e4 --- /dev/null +++ b/lingua_franca/res/text/gl-es/date_time_test.json @@ -0,0 +1,43 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "un a.C." }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "dez a.C." }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "noventa-e-dous a.C." }, + "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "oitocentos tres" }, + "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "oitocentos once" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "catrocentos cincuenta-e-catro" }, + "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mil cinco" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mil doce" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mil corenta-e-seis" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mil oitocentos sete" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mil setecentos dezasete" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mil novecentos oitenta-e-oito"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dous mil nove"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dous mil 
dezaoito"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dous mil vinte-e-un"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dous mil trinta"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "dous mil cen" }, + "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "mil" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dous mil" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tres mil cento vinte a.C." }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "tres mil douscentos corenta-e-un a.C." }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "cinco mil douscentos" }, + "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "mil cen" }, + "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "dous mil cen" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "martes, trinta-e-un de xaneiro de dous mil dezasete"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "domingo, catro de febreiro de dous mil dezaoito"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "domingo, catro de febreiro"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "domingo, día catro"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "mañá"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "hoxe"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "onte"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", 
"assertEqual": "domingo, catro de febreiro"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "domingo, catro de febreiro de dous mil dezaoito"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "martes, trinta-e-un de xaneiro de dous mil dezasete á unha e vinte-e-dous da tarde"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "martes, trinta-e-un de xaneiro de dous mil dezasete ás trece e vinte-e-dous"} + } +} diff --git a/lingua_franca/res/text/gl-es/day.word b/lingua_franca/res/text/gl-es/day.word new file mode 100644 index 00000000..1f65386b --- /dev/null +++ b/lingua_franca/res/text/gl-es/day.word @@ -0,0 +1 @@ +día diff --git a/lingua_franca/res/text/gl-es/days.word b/lingua_franca/res/text/gl-es/days.word new file mode 100644 index 00000000..3b1c6763 --- /dev/null +++ b/lingua_franca/res/text/gl-es/days.word @@ -0,0 +1 @@ +días diff --git a/lingua_franca/res/text/gl-es/hour.word b/lingua_franca/res/text/gl-es/hour.word new file mode 100644 index 00000000..cb26009f --- /dev/null +++ b/lingua_franca/res/text/gl-es/hour.word @@ -0,0 +1 @@ +hora diff --git a/lingua_franca/res/text/gl-es/hours.word b/lingua_franca/res/text/gl-es/hours.word new file mode 100644 index 00000000..49f065da --- /dev/null +++ b/lingua_franca/res/text/gl-es/hours.word @@ -0,0 +1 @@ +horas diff --git a/lingua_franca/res/text/gl-es/minute.word b/lingua_franca/res/text/gl-es/minute.word new file mode 100644 index 00000000..9b638826 --- /dev/null +++ b/lingua_franca/res/text/gl-es/minute.word @@ -0,0 +1 @@ +minuto diff --git a/lingua_franca/res/text/gl-es/minutes.word b/lingua_franca/res/text/gl-es/minutes.word new file mode 100644 index 00000000..5028337e --- /dev/null +++ b/lingua_franca/res/text/gl-es/minutes.word @@ -0,0 +1 @@ +minutos diff --git 
a/lingua_franca/res/text/gl-es/normalize.json b/lingua_franca/res/text/gl-es/normalize.json new file mode 100644 index 00000000..e0191e1d --- /dev/null +++ b/lingua_franca/res/text/gl-es/normalize.json @@ -0,0 +1,106 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": false, + "remove_symbols": true, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": true, + "contractions": {}, + "word_replacements": {}, + "number_replacements": { + "catorce": "14", + "cen": "100", + "centos": "100", + "cinco": "5", + "cincocentas": "500", + "cincocentos": "500", + "cincuenta": "50", + "dez": "10", + "dezanove": "19", + "dezaseis": "16", + "dezasete": "17", + "dezaoito": "18", + "dezaoito": "18", + "dous": "2", + "duascentas": "200", + "douscentos": "200", + "doce": "12", + "dúas": "2", + "duascentas": "200", + "oitenta": "80", + "oito": "8", + "oitocentas": "800", + "oitocentos": "800", + "mil": "1000", + "millón": "1000000", + "nove": "9", + "novecentas": "900", + "novecentos": "900", + "noventa": "90", + "once": "11", + "primeiro": "1", + "primeira": "1", + "corenta": "40", + "catro": "4", + "catrocentas": "400", + "catrocentos": "400", + "quince": "15", + "segundo": "2", + "segunda": "2", + "sesenta": "60", + "sete": "7", + "setecentas": "700", + "setecentos": "700", + "setenta": "70", + "seis": "6", + "seiscentas": "600", + "seiscentos": "600", + "terceiro": "3", + "trinta": "30", + "tres": "3", + "trescentas": "300", + "trescentos": "300", + "trece": "13", + "un": "1", + "unha": "1", + "vinte": "20", + "oitenta": "80", + "oito": "8", + "oitocentas": "800", + "oitocentos": "800", + "cero": "0" + }, + "stopwords": [ + "de", + "do", + "dos", + "el", + "ela", + "eles", + "elas", + "eu", + "e", + "ao", + "ata o", + "á", + "nós", + "ata o", + "para", + "este", + "esta", + "estes", + "estas", + "aquel", + "aquela", + "aqueles", + "aquelas", + "que" + ], + "articles": [ + "o", + "a", + "os", + "as" + ] +} diff --git
a/lingua_franca/res/text/gl-es/or.word b/lingua_franca/res/text/gl-es/or.word new file mode 100644 index 00000000..d96c19d8 --- /dev/null +++ b/lingua_franca/res/text/gl-es/or.word @@ -0,0 +1 @@ +ou diff --git a/lingua_franca/res/text/gl-es/second.word b/lingua_franca/res/text/gl-es/second.word new file mode 100644 index 00000000..9c41ac63 --- /dev/null +++ b/lingua_franca/res/text/gl-es/second.word @@ -0,0 +1 @@ +segundo diff --git a/lingua_franca/res/text/gl-es/seconds.word b/lingua_franca/res/text/gl-es/seconds.word new file mode 100644 index 00000000..f9955ce3 --- /dev/null +++ b/lingua_franca/res/text/gl-es/seconds.word @@ -0,0 +1 @@ +segundos diff --git a/test/test_format_gl-es.py b/test/test_format_gl-es.py new file mode 100644 index 00000000..3adc2100 --- /dev/null +++ b/test/test_format_gl-es.py @@ -0,0 +1,350 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Formatting tests for the Galician (gl-es) localisation.

Covers ``nice_number``, ``pronounce_number`` and ``nice_time`` with the
language explicitly set to "gl-es".
"""

import unittest
import datetime

from lingua_franca import load_language, unload_language, set_default_lang
from lingua_franca.format import nice_number
from lingua_franca.format import nice_time
from lingua_franca.format import pronounce_number
from lingua_franca.time import default_timezone


def setUpModule():
    """Load Galician and make it the default language for this module."""
    load_language('gl-es')
    set_default_lang('gl-es')


def tearDownModule():
    """Unload Galician once every test in this module has run."""
    unload_language('gl-es')


# Maps an input number to the spoken form nice_number() is expected to give.
# NOTE: identifiers use "gl_es" because a hyphen is not valid in a Python
# name; the language *code* passed to the library stays "gl-es".
NUMBERS_FIXTURE_GL_ES = {
    1.435634: '1,436',
    2: '2',
    5.0: '5',
    0.027: '0,027',
    0.5: 'un medio',
    1.333: '1 e 1 terzo',
    2.666: '2 e 2 terzo',
    0.25: 'un cuarto',
    1.25: '1 e 1 cuarto',
    0.75: '3 cuartos',
    1.75: '1 e 3 cuartos',
    3.4: '3 e 2 quintos',
    16.8333: '16 e 5 sextos',
    12.5714: '12 e 4 séptimos',
    9.625: '9 e 5 oitavos',
    6.777: '6 e 7 novenos',
    3.1: '3 e 1 décimo',
    2.272: '2 e 3 onceavos',
    5.583: '5 e 7 doceavos',
    8.384: '8 e 5 treceavos',
    0.071: 'un catorceavo',
    6.466: '6 e 7 quinceavos',
    8.312: '8 e 5 dezaseisavos',
    2.176: '2 e 3 dezaseteavos',
    200.722: '200 e 13 dezaoitoavos',
    7.421: '7 e 8 dezanoveavos',
    0.05: 'un vinteavo',
}


class TestNiceNumberFormat_gl_es(unittest.TestCase):
    """nice_number() output for Galician."""

    def test_convert_float_to_nice_number_gl_es(self):
        for number, number_str in NUMBERS_FIXTURE_GL_ES.items():
            self.assertEqual(nice_number(number, lang="gl-es"), number_str,
                             'should format {} as {} and not {}'.format(
                                 number, number_str, nice_number(
                                     number, lang="gl-es")))

    def test_specify_denominator_gl_es(self):
        self.assertEqual(nice_number(5.5, lang="gl-es",
                                     denominators=[1, 2, 3]),
                         '5 e medio',
                         'should format 5.5 as 5 e medio not {}'.format(
                             nice_number(5.5, lang="gl-es",
                                         denominators=[1, 2, 3])))
        self.assertEqual(nice_number(2.333, lang="gl-es",
                                     denominators=[1, 2]),
                         '2,333',
                         'should format 2.333 as 2,333 not {}'.format(
                             nice_number(2.333, lang="gl-es",
                                         denominators=[1, 2])))

    def test_no_speech_gl_es(self):
        self.assertEqual(nice_number(6.777, lang="gl-es", speech=False),
                         '6 7/9',
                         'should format 6.777 as 6 7/9 not {}'.format(
                             nice_number(6.777, lang="gl-es", speech=False)))
        self.assertEqual(nice_number(6.0, lang="gl-es", speech=False),
                         '6',
                         'should format 6.0 as 6 not {}'.format(
                             nice_number(6.0, lang="gl-es", speech=False)))
        # The trailing space after "as" keeps the two concatenated literals
        # from running together in the failure message.
        self.assertEqual(nice_number(1234567890, lang="gl-es", speech=False),
                         '1 234 567 890',
                         'should format 1234567890 as '
                         '1 234 567 890 not {}'.format(
                             nice_number(1234567890, lang="gl-es",
                                         speech=False)))
        self.assertEqual(nice_number(12345.6789, lang="gl-es", speech=False),
                         '12 345,679',
                         'should format 12345.6789 as '
                         '12 345,679 not {}'.format(
                             nice_number(12345.6789, lang="gl-es",
                                         speech=False)))


class TestPronounceNumber(unittest.TestCase):
    """pronounce_number() output for Galician."""

    def test_convert_int(self):
        self.assertEqual(pronounce_number(0, lang="gl-es"), "cero")
        self.assertEqual(pronounce_number(1, lang="gl-es"), "un")
        self.assertEqual(pronounce_number(10, lang="gl-es"), "dez")
        self.assertEqual(pronounce_number(15, lang="gl-es"), "quince")
        self.assertEqual(pronounce_number(21, lang="gl-es"), "vinte e un")
        self.assertEqual(pronounce_number(27, lang="gl-es"), "vinte e sete")
        self.assertEqual(pronounce_number(30, lang="gl-es"), "trinta")
        self.assertEqual(pronounce_number(19, lang="gl-es"), "dezanove")
        self.assertEqual(pronounce_number(88, lang="gl-es"), "oitenta e oito")
        self.assertEqual(pronounce_number(46, lang="gl-es"), "corenta e seis")
        self.assertEqual(pronounce_number(99, lang="gl-es"), "noventa e nove")

    def test_convert_negative_int(self):
        self.assertEqual(pronounce_number(-1, lang="gl-es"), "menos un")
        self.assertEqual(pronounce_number(-10, lang="gl-es"), "menos dez")
        self.assertEqual(pronounce_number(-15, lang="gl-es"), "menos quince")
        self.assertEqual(pronounce_number(-21, lang="gl-es"),
                         "menos vinte e un")
        self.assertEqual(pronounce_number(-27, lang="gl-es"),
                         "menos vinte e sete")
        self.assertEqual(pronounce_number(-30, lang="gl-es"), "menos trinta")
        self.assertEqual(pronounce_number(-35, lang="gl-es"),
                         "menos trinta e cinco")
        self.assertEqual(pronounce_number(-83, lang="gl-es"),
                         "menos oitenta e tres")
        self.assertEqual(pronounce_number(-19, lang="gl-es"), "menos dezanove")
        self.assertEqual(pronounce_number(-88, lang="gl-es"),
                         "menos oitenta e oito")
        self.assertEqual(pronounce_number(-46, lang="gl-es"),
                         "menos corenta e seis")
        self.assertEqual(pronounce_number(-99, lang="gl-es"),
                         "menos noventa e nove")

    def test_convert_decimals(self):
        self.assertEqual(pronounce_number(
            0.05, lang="gl-es"), "cero coma cero cinco")
        self.assertEqual(pronounce_number(
            -0.05, lang="gl-es"), "menos cero coma cero cinco")
        # "un", not the Spanish "uno": test_convert_int above expects
        # pronounce_number(1) == "un".
        self.assertEqual(pronounce_number(1.234, lang="gl-es"),
                         "un coma dous tres catro")
        self.assertEqual(pronounce_number(21.234, lang="gl-es"),
                         "vinte e un coma dous tres")
        self.assertEqual(pronounce_number(21.234, lang="gl-es", places=1),
                         "vinte e un coma dous")
        self.assertEqual(pronounce_number(21.234, lang="gl-es", places=0),
                         "vinte e un")
        self.assertEqual(pronounce_number(21.234, lang="gl-es", places=3),
                         "vinte e un coma dous tres catro")
        self.assertEqual(pronounce_number(21.234, lang="gl-es", places=4),
                         "vinte e un coma dous tres catro")
        self.assertEqual(pronounce_number(21.234, lang="gl-es", places=5),
                         "vinte e un coma dous tres catro")
        self.assertEqual(pronounce_number(-21.234, lang="gl-es"),
                         "menos vinte e un coma dous tres")
        self.assertEqual(pronounce_number(-21.234, lang="gl-es", places=1),
                         "menos vinte e un coma dous")
        self.assertEqual(pronounce_number(-21.234, lang="gl-es", places=0),
                         "menos vinte e un")
        self.assertEqual(pronounce_number(-21.234, lang="gl-es", places=3),
                         "menos vinte e un coma dous tres catro")
        self.assertEqual(pronounce_number(-21.234, lang="gl-es", places=4),
                         "menos vinte e un coma dous tres catro")
        self.assertEqual(pronounce_number(-21.234, lang="gl-es", places=5),
                         "menos vinte e un coma dous tres catro")


class TestNiceDateFormat(unittest.TestCase):
    """nice_time() output for Galician."""

    def test_convert_times(self):
        dt = datetime.datetime(2017, 1, 31,
                               13, 22, 3, tzinfo=default_timezone())

        # Verify defaults haven't changed
        self.assertEqual(nice_time(dt, lang="gl-es"),
                         nice_time(dt, "gl-es", True, False, False))

        self.assertEqual(nice_time(dt, lang="gl-es"),
                         "a unha e vinte e dous")
        self.assertEqual(nice_time(dt, lang="gl-es", use_ampm=True),
                         "a unha e vinte e dous da tarde")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False), "1:22")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_ampm=True), "1:22 PM")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_24hour=True), "13:22")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_24hour=True, use_ampm=True), "13:22")
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True,
                                   use_ampm=True), "as trece vinte e dous")
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True,
                                   use_ampm=False), "as trece vinte e dous")

        dt = datetime.datetime(2017, 1, 31,
                               13, 0, 3, tzinfo=default_timezone())
        self.assertEqual(nice_time(dt, lang="gl-es"),
                         "a unha en punto")
        self.assertEqual(nice_time(dt, lang="gl-es", use_ampm=True),
                         "a unha da tarde")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False),
                         "1:00")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_ampm=True), "1:00 PM")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_24hour=True), "13:00")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_24hour=True, use_ampm=True), "13:00")
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True,
                                   use_ampm=True), "as trece cero cero")

        dt = datetime.datetime(2017, 1, 31,
                               13, 2, 3, tzinfo=default_timezone())
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True),
                         "as trece cero dous")
        # "da tarde", matching the 13:22 and 13:00 cases above (the original
        # expectation carried the Spanish "de la tarde").
        self.assertEqual(nice_time(dt, lang="gl-es", use_ampm=True),
                         "a unha e dúas da tarde")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False),
                         "1:02")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_ampm=True), "1:02 PM")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_24hour=True), "13:02")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_24hour=True, use_ampm=True), "13:02")
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True,
                                   use_ampm=True), "as trece cero dous")
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True,
                                   use_ampm=False), "as trece cero dous")

        dt = datetime.datetime(2017, 1, 31,
                               0, 2, 3, tzinfo=default_timezone())
        self.assertEqual(nice_time(dt, lang="gl-es"),
                         "as doce e dúas")
        self.assertEqual(nice_time(dt, lang="gl-es", use_ampm=True),
                         "as doce e dúas da madrugada")
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True),
                         "as cero cero dous")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False),
                         "12:02")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_ampm=True), "12:02 AM")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_24hour=True), "00:02")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_24hour=True,
                                   use_ampm=True), "00:02")
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True,
                                   use_ampm=True), "as cero cero dous")
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True,
                                   use_ampm=False), "as cero cero dous")

        dt = datetime.datetime(2017, 1, 31,
                               12, 15, 9, tzinfo=default_timezone())
        self.assertEqual(nice_time(dt, lang="gl-es"),
                         "as doce e cuarto")
        self.assertEqual(nice_time(dt, lang="gl-es", use_ampm=True),
                         "as doce e cuarto da mañá")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False),
                         "12:15")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_ampm=True),
                         "12:15 PM")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_24hour=True),
                         "12:15")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_24hour=True, use_ampm=True),
                         "12:15")
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True,
                                   use_ampm=True),
                         "as doce quince")
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True,
                                   use_ampm=False),
                         "as doce quince")

        dt = datetime.datetime(2017, 1, 31,
                               19, 40, 49, tzinfo=default_timezone())
        self.assertEqual(nice_time(dt, lang="gl-es"),
                         "as oito menos vinte")
        self.assertEqual(nice_time(dt, lang="gl-es", use_ampm=True),
                         "as oito menos vinte da tarde")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False),
                         "7:40")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_ampm=True),
                         "7:40 PM")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_24hour=True),
                         "19:40")
        self.assertEqual(nice_time(dt, lang="gl-es", speech=False,
                                   use_24hour=True, use_ampm=True),
                         "19:40")
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True,
                                   use_ampm=True),
                         "as dezanove corenta")
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True,
                                   use_ampm=False),
                         "as dezanove corenta")

        dt = datetime.datetime(2017, 1, 31,
                               1, 15, 0, tzinfo=default_timezone())
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True),
                         "a unha quince")

        dt = datetime.datetime(2017, 1, 31,
                               1, 35, 0, tzinfo=default_timezone())
        self.assertEqual(nice_time(dt, lang="gl-es"),
                         "as dúas menos vinte e cinco")

        dt = datetime.datetime(2017, 1, 31,
                               1, 45, 0, tzinfo=default_timezone())
        self.assertEqual(nice_time(dt, lang="gl-es"),
                         "as dúas menos cuarto")

        dt = datetime.datetime(2017, 1, 31,
                               4, 50, 0, tzinfo=default_timezone())
        self.assertEqual(nice_time(dt, lang="gl-es"),
                         "as cinco menos dez")

        dt = datetime.datetime(2017, 1, 31,
                               5, 55, 0, tzinfo=default_timezone())
        self.assertEqual(nice_time(dt, lang="gl-es"),
                         "as seis menos cinco")

        dt = datetime.datetime(2017, 1, 31,
                               5, 30, 0, tzinfo=default_timezone())
        self.assertEqual(nice_time(dt, lang="gl-es", use_ampm=True),
                         "as cinco e media da madrugada")

        dt = datetime.datetime(2017, 1, 31,
                               23, 15, 9, tzinfo=default_timezone())
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=True,
                                   use_ampm=True),
                         "as vinte e tres quince")
        # "noite" is the Galician word; the original expectation used the
        # Spanish "noche".
        self.assertEqual(nice_time(dt, lang="gl-es", use_24hour=False,
                                   use_ampm=True),
                         "as once e cuarto da noite")


if __name__ == "__main__":
    unittest.main()
#
from datetime import datetime
import unittest

from lingua_franca import load_language, unload_language, set_default_lang
from lingua_franca.parse import (normalize, extract_numbers, extract_number,
                                 extract_datetime)
# NOTE(review): the original imported names containing "-", which cannot
# appear in a Python identifier (the module did not even parse). "_" is
# assumed here and the module path is left as it was -- confirm both against
# the names the Galician parser module actually exports.
from lingua_franca.lang.parse_es import (extract_datetime_gl_es,
                                         is_fractional_gl_es)
from lingua_franca.time import default_timezone


def setUpModule():
    """Load ``gl-es`` and make it the default language for this module."""
    load_language('gl-es')
    set_default_lang('gl-es')


def tearDownModule():
    """Unload ``gl-es`` once every test in this module has run."""
    unload_language('gl-es')


class TestNormalize(unittest.TestCase):
    """
        Test cases for Galician parsing
    """

    def test_articles_gl_es(self):
        """Articles are dropped when ``remove_articles=True``."""
        # "a" is an article and is removed; the rest must stay Galician.
        self.assertEqual(normalize("esta é a proba", lang="gl-es",
                                   remove_articles=True),
                         "esta é proba")
        self.assertEqual(normalize("e outra proba", lang="gl-es",
                                   remove_articles=True),
                         "e outra proba")

    def test_numbers_gl_es(self):
        """Spelled-out numbers are rewritten as digits by ``normalize``."""
        self.assertEqual(normalize("isto é un un un", lang="gl-es"),
                         "isto é 1 1 1")
        self.assertEqual(normalize("isto é dous tres proba", lang="gl-es"),
                         "isto é 2 3 proba")
        self.assertEqual(normalize("isto é catro cinco seis proba",
                                   lang="gl-es"),
                         "isto é 4 5 6 proba")
        self.assertEqual(normalize("sete máis oito máis nove", lang="gl-es"),
                         "7 máis 8 máis 9")
        self.assertEqual(normalize("dez once doce trece catorce quince",
                                   lang="gl-es"),
                         "10 11 12 13 14 15")
        self.assertEqual(normalize("dezaseis dezasete", lang="gl-es"),
                         "16 17")
        self.assertEqual(normalize("dezaoito dezanove", lang="gl-es"),
                         "18 19")
        self.assertEqual(normalize("vinte trinta corenta", lang="gl-es"),
                         "20 30 40")
        self.assertEqual(normalize("trinta e dous cabalos", lang="gl-es"),
                         "32 cabalos")
        self.assertEqual(normalize("cen cabalos", lang="gl-es"),
                         "100 cabalos")
        self.assertEqual(normalize("cento once cabalos", lang="gl-es"),
                         "111 cabalos")
        # "catrocentas" is the spelling used by the gl-es number table.
        self.assertEqual(normalize("había catrocentas unha vacas",
                                   lang="gl-es"),
                         "había 401 vacas")
        self.assertEqual(normalize("dous mil", lang="gl-es"),
                         "2000")
        self.assertEqual(normalize("dous mil trescentas corenta e cinco",
                                   lang="gl-es"),
                         "2345")
        self.assertEqual(normalize(
            "cento vinte e tres mil catrocentas cincuenta e seis",
            lang="gl-es"),
            "123456")
        self.assertEqual(normalize(
            "cincocentas vinte e cinco mil", lang="gl-es"),
            "525000")
        self.assertEqual(normalize(
            "novecentos noventa e nove mil novecentos noventa e nove",
            lang="gl-es"),
            "999999")

    def test_extract_number_es(self):
        """Numbers are extracted from digits, words, decimals, fractions."""
        self.assertEqual(sorted(extract_numbers(
            "1 7 catro catorce oito 157", lang='gl-es')),
            [1, 4, 7, 8, 14, 157])
        self.assertEqual(sorted(extract_numbers(
            "1 7 catro albuquerque laranxa John Doe catorce oito 157",
            lang='gl-es')), [1, 4, 7, 8, 14, 157])
        self.assertEqual(extract_number("seis punto dous", lang='gl-es'), 6.2)
        self.assertEqual(extract_number("seis coma dous", lang='gl-es'), 6.2)
        self.assertEqual(extract_numbers("un medio", lang='gl-es'), [0.5])
        self.assertEqual(extract_number("cuarto", lang='gl-es'), 0.25)

        self.assertEqual(extract_number("2.0", lang='gl-es'), 2.0)
        self.assertEqual(extract_number("1/4", lang='gl-es'), 0.25)

        self.assertEqual(extract_number("dous e media", lang='gl-es'), 2.5)
        self.assertEqual(extract_number(
            "catorce e milésima", lang='gl-es'), 14.001)

        self.assertEqual(extract_number("dous punto cero dous",
                                        lang='gl-es'), 2.02)

    def test_isFraction_es(self):
        """Ordinal fraction words map to the corresponding ``1 / n``."""
        self.assertEqual(is_fractional_gl_es("vixésimo"), 1.0 / 20)
        self.assertEqual(is_fractional_gl_es("vixésima"), 1.0 / 20)
        self.assertEqual(is_fractional_gl_es("trixésimo"), 1.0 / 30)
        self.assertEqual(is_fractional_gl_es("centésima"), 1.0 / 100)
        self.assertEqual(is_fractional_gl_es("centésimo"), 1.0 / 100)
        self.assertEqual(is_fractional_gl_es("milésima"), 1.0 / 1000)

    @unittest.skip("unwritten logic")
    def test_comma_fraction_logic_gl_es(self):
        """Skipped: "#,#" is not yet parsed as the decimal "#.#"."""
        # Logic has not been written to parse "#,#" as "#.#"
        # English-style decimal numbers work because they just get
        # float(str)ed
        self.assertEqual(extract_number("2,0", lang='gl-es'), 2.0)


class TestDatetime_gl_es(unittest.TestCase):
    """Date/time extraction tests for the ``gl-es`` locale."""

    def test_datetime_by_date_gl_es(self):
        """"<day> <month abbreviation>" resolves to that calendar date."""
        # test currentDate==None
        _now = datetime.now()
        # 11 January is expected in the current year only while today is
        # 1-10 January; otherwise next year's date is expected.
        relative_year = _now.year if (_now.month == 1 and _now.day < 11) \
            else (_now.year + 1)
        self.assertEqual(
            extract_datetime_gl_es("11 xan", anchorDate=_now)[0],
            datetime(relative_year, 1, 11))

        # test months
        # there is an issue with the months of june through september;
        # those live in test_bugged_output_wastebasket below
        working_months = ((1, "xan"), (2, "feb"), (3, "mar"), (4, "abr"),
                          (5, "mai"), (10, "out"), (11, "nov"), (12, "dec"))
        for month, abbr in working_months:
            with self.subTest(month=abbr):
                self.assertEqual(
                    extract_datetime(
                        "11 " + abbr, lang='gl-es',
                        anchorDate=datetime(1998, month, 1))[0],
                    datetime(1998, month, 11, tzinfo=default_timezone()))

        self.assertEqual(extract_datetime("", lang='gl-es'), None)

    # TODO fix bug causing these tests to fail (MycroftAI/mycroft-core#2348)
    #         fix the translation bug that keeps the functions below from
    #         returning correctly
    #         (written with apologies by an English speaker)
    #      further broken tests are below their respective working tests.
    @unittest.skip("currently processing these months incorrectly")
    def test_bugged_output_wastebasket(self):
        """Skipped: June-September and explicit years are mis-parsed."""
        self.assertEqual(extract_datetime(
            "11 xuñ", lang='gl-es', anchorDate=datetime(1998, 6, 1))[0],
            datetime(1998, 6, 11, tzinfo=default_timezone()))
        self.assertEqual(extract_datetime(
            "11 xuño", lang='gl-es', anchorDate=datetime(1998, 6, 1))[0],
            datetime(1998, 6, 11, tzinfo=default_timezone()))
        self.assertEqual(extract_datetime(
            "11 xul", lang='gl-es', anchorDate=datetime(1998, 7, 1))[0],
            datetime(1998, 7, 11, tzinfo=default_timezone()))
        self.assertEqual(extract_datetime(
            "11 ago", lang='gl-es', anchorDate=datetime(1998, 8, 1))[0],
            datetime(1998, 8, 11, tzinfo=default_timezone()))
        self.assertEqual(extract_datetime(
            "11 set", lang='gl-es', anchorDate=datetime(1998, 9, 1))[0],
            datetime(1998, 9, 11, tzinfo=default_timezone()))

        # It's also failing on years
        self.assertEqual(extract_datetime(
            "11 ago 1998", lang='gl-es')[0],
            datetime(1998, 8, 11, tzinfo=default_timezone()))

    def test_extract_datetime_relative(self):
        """Relative phrases resolve against the 1998-01-01 anchor."""
        self.assertEqual(extract_datetime(
            "esta noite", anchorDate=datetime(1998, 1, 1),
            lang='gl-es'),
            [datetime(1998, 1, 1, 21, 0, 0, tzinfo=default_timezone()),
             'esta'])
        self.assertEqual(extract_datetime(
            "onte á noite", anchorDate=datetime(1998, 1, 1),
            lang='gl-es')[0],
            datetime(1997, 12, 31, 21, tzinfo=default_timezone()))
        self.assertEqual(extract_datetime(
            "a noite de antonte", anchorDate=datetime(1998, 1, 1),
            lang='gl-es')[0],
            datetime(1997, 12, 30, 21, tzinfo=default_timezone()))
        self.assertEqual(extract_datetime(
            "a noite de antes de antonte", anchorDate=datetime(1998, 1, 1),
            lang='gl-es')[0],
            datetime(1997, 12, 29, 21, tzinfo=default_timezone()))
        self.assertEqual(extract_datetime(
            "mañá pola mañá", anchorDate=datetime(1998, 1, 1),
            lang='gl-es')[0],
            datetime(1998, 1, 2, 8, tzinfo=default_timezone()))
        self.assertEqual(extract_datetime(
            "onte pola tarde", anchorDate=datetime(1998, 1, 1),
            lang='gl-es')[0],
            datetime(1997, 12, 31, 15, tzinfo=default_timezone()))

        self.assertEqual(extract_datetime(
            "hoxe ás 2 da mañá", lang='gl-es',
            anchorDate=datetime(1998, 1, 1))[0],
            datetime(1998, 1, 1, 2, tzinfo=default_timezone()))
        self.assertEqual(extract_datetime(
            "hoxe ás 2 da tarde", lang='gl-es',
            anchorDate=datetime(1998, 1, 1))[0],
            datetime(1998, 1, 1, 14, tzinfo=default_timezone()))

    def test_extractdatetime_no_time(self):
        """Check that None is returned if no time is found in sentence."""
        self.assertEqual(extract_datetime('non hai tempo', lang='gl-es'),
                         None)

    @unittest.skip("These phrases are not parsing correctly.")
    def test_extract_datetime_relative_failing(self):
        """Skipped: relative phrases that do not parse correctly yet."""
        # parses as "morning" and returns 8:00 on anchorDate
        self.assertEqual(extract_datetime(
            "mañá", anchorDate=datetime(1998, 1, 1), lang='gl-es')[0],
            datetime(1998, 1, 2))

        # unimplemented logic
        self.assertEqual(extract_datetime(
            "onte á noite", anchorDate=datetime(1998, 1, 1),
            lang='gl-es')[0], datetime(1997, 12, 31, 21))
        self.assertEqual(extract_datetime(
            "antonte á noite", anchorDate=datetime(1998, 1, 1),
            lang='gl-es')[0], datetime(1997, 12, 30, 21))
        self.assertEqual(extract_datetime(
            "fai tres noites", anchorDate=datetime(1998, 1, 1),
            lang='gl-es')[0], datetime(1997, 12, 29, 21))


if __name__ == "__main__":
    unittest.main()