From c61ac3dcdd3d4e843a1056dc3f3332da56bbedd6 Mon Sep 17 00:00:00 2001 From: Angel Docampo Date: Thu, 17 Oct 2019 01:27:05 +0200 Subject: [PATCH 1/4] modified common_data_es.py to resemble PT --- lingua_franca/lang/common_data_es.py | 135 ++++++++++++++++++++++----- 1 file changed, 112 insertions(+), 23 deletions(-) diff --git a/lingua_franca/lang/common_data_es.py b/lingua_franca/lang/common_data_es.py index a7ccb739..f8662839 100644 --- a/lingua_franca/lang/common_data_es.py +++ b/lingua_franca/lang/common_data_es.py @@ -19,18 +19,40 @@ from collections import OrderedDict +# Undefined articles ["un", "unos", "una", "unas"] can not be supressed, +# in ES, "un cavallo" means "a horse" or "one horse". _ARTICLES_ES = {'el', 'la', 'los', 'las'} +# word rules for gender +# TODO: review rules to see exceptions +_FEMALE_ENDINGS_ES = ["a", "as"] +_MALE_ENDINGS_ES = ["o", "os"] -_NUM_STRING_ES = { +# special cases, word lookup for words not covered by above rule +_GENDERS_ES = { + "mujer": "f", + "mujeres": "f", + "madre": "f", + "hombre": "m", + "padre": "m" +} + +# context rules for gender +_MALE_DETERMINANTS_ES = ["lo", "los", "este", "estos", "ese", "esos"] +_FEMALE_DETERMINANTS_ES = ["la", "las", "esta", "estas", "esa", "esas"] + + +_NUMBERS_ES = { "cero": 0, "un": 1, "uno": 1, "una": 1, + "primero": 1, + "segundo": 2, + "tercero": 3, "dos": 2, "tres": 3, - u"trés": 3, "cuatro": 4, "cinco": 5, "seis": 6, @@ -43,28 +65,23 @@ "trece": 13, "catorce": 14, "quince": 15, - "dieciseis": 16, - u"dieciséis": 16, + "dieciséis": 16, "diecisiete": 17, "dieciocho": 18, "diecinueve": 19, "veinte": 20, "veintiuno": 21, - u"veintid�s": 22, - u"veintitr�s": 23, - "veintidos": 22, - "veintitres": 23, - u"veintitrés": 23, + "veintidós": 22, + "veintitrés": 23, "veinticuatro": 24, "veinticinco": 25, - u"veintiséis": 26, - "veintiseis": 26, + "veintiséis": 26, "veintisiete": 27, "veintiocho": 28, "veintinueve": 29, "treinta": 30, "cuarenta": 40, - "cincuenta": 50, + "cinquenta": 50, "sesenta": 60, "setenta": 70, "ochenta": 80, @@ -73,8 +90,8 @@ "ciento": 100, "doscientos": 200, "doscientas": 200, - "trescientos": 300, - "trescientas": 300, + "trecientos": 300, + "trecientas": 300, "cuatrocientos": 400, "cuatrocientas": 400, "quinientos": 500, @@ -87,9 +104,18 @@ "ochocientas": 800, "novecientos": 900, "novecientas": 900, - "mil": 1000} - + "mil": 1000 + } +# Fractions can be noun (e.g: 1/2, 3/4) or adjectives (e.g.: 1/4 part of +# something). As a noun is commonly expressed as masculine while when is an +# adjective is always femenine +# https://espanol.lingolia.com/es/vocabulario/numeros-fechas-horas/fracciones +# Because the femenine particle (adjective) can be extrapolated thanks to the +# article, we put here just the noun. +# As a noun, fractions for décimas, centésimas, milésimas, cienmilésimas, etc +# are expressed in femenine in Spain, while in some Latam countries are +# expressed in masculine. _FRACTION_STRING_ES = { 2: 'medio', 3: 'tercio', @@ -109,9 +135,72 @@ 17: 'diecisieteavo', 18: 'dieciochoavo', 19: 'diecinueveavo', - 20: 'veinteavo' + 20: 'veinteavo', + 30: 'treintavo', + 100: 'centésima', + 1000: 'milésima' } + +_NUM_STRING_ES = { + 0: 'cero', + 1: 'uno', + 2: 'dos', + 3: 'tres', + 4: 'cuatro', + 5: 'cinco', + 6: 'seis', + 7: 'siete', + 8: 'ocho', + 9: 'nueve', + 10: 'diez', + 11: 'once', + 12: 'doce', + 13: 'trece', + 14: 'catorce', + 15: 'quince', + 16: 'dieciséis', + 17: 'diecisete', + 18: 'dieciocho', + 19: 'diecinueve', + 20: 'veinte', + 21: 'veintiuno', + 22: 'veintidos', + 23: 'veintitres', + 24: 'veinticuatro', + 25: 'veinticinco', + 26: 'veintiséis', + 27: 'veintisiete', + 28: 'veintiocho', + 29: 'veintinueve', + 30: 'treinta', + 40: 'cuarenta', + 50: 'cincuenta', + 60: 'sesenta', + 70: 'setenta', + 80: 'ochenta', + 90: 'noventa' +} + +# split sentence parse separately and sum ( 2 and a half = 2 + 0.5 ) +_FRACTION_MARKER_ES = {"y"} +# for non english speakers, "X Y avos" means X / Y , Y must be > 10 +_SUFFIX_FRACTION_MARKER_ES = {"avos"} +# mean you should sum the next number , equivalent to "and", i.e, "two thousand and one" +# WARNING: In spanish we use this ONLY between tens and units (i.e. treinta y seis) +_SUM_MARKER_ES = {"y"} +# decimal marker ( 1 point 5 = 1 + 0.5) +# WARNING: In proper Spanish, the only valid term is "coma", we put here "punto" for +# compatibility, but is wrong, as the decimals are written with comma, so even when +# we read loud a text in english like 0.5 we said "cero coma cinco" +_DECIMAL_MARKER_ES = {"coma", "punto"} + +# negate next number (-2 = 0 - 2) +_NEGATIVES_ES = {"menos"} +# negate previous number, "2 negative" -> -2 +_NEGATIVE_SUFFIX_MARKER_ES = {"negativo", "negativos"} + +# Long scale is the default scale on Spain # https://www.grobauer.at/es_eur/zahlnamen.php _LONG_SCALE_ES = OrderedDict([ (100, 'centena'), @@ -143,7 +232,7 @@ (1e366, "unsexagintillón") ]) - +# Short scale is the default scale on America _SHORT_SCALE_ES = OrderedDict([ (100, 'centena'), (1000, 'millar'), @@ -219,7 +308,7 @@ ]) # TODO: female forms. -_ORDINAL_STRING_BASE_ES = { +_ORDINAL_BASE_ES = { 1: 'primero', 2: 'segundo', 3: 'tercero', @@ -252,7 +341,7 @@ } -_SHORT_ORDINAL_STRING_ES = { +_SHORT_ORDINAL_ES = { 1e6: "millonésimo", 1e9: "milmillonésimo", 1e12: "billonésimo", @@ -265,10 +354,10 @@ 1e33: "milquintillonésimo" # TODO > 1e-33 } -_SHORT_ORDINAL_STRING_ES.update(_ORDINAL_STRING_BASE_ES) +_SHORT_ORDINAL_ES.update(_ORDINAL_BASE_ES) -_LONG_ORDINAL_STRING_ES = { +_LONG_ORDINAL_ES = { 1e6: "millonésimo", 1e12: "billionth", 1e18: "trillonésimo", @@ -281,4 +370,4 @@ 1e60: "decillonésimo" # TODO > 1e60 } -_LONG_ORDINAL_STRING_ES.update(_ORDINAL_STRING_BASE_ES) +_LONG_ORDINAL_ES.update(_ORDINAL_BASE_ES) From 12fccf2b32d54e1bacace910ccb6209a2a423757 Mon Sep 17 00:00:00 2001 From: Angel Docampo Date: Thu, 17 Oct 2019 01:34:57 +0200 Subject: [PATCH 2/4] added more exceptions to gender --- lingua_franca/lang/common_data_es.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/lingua_franca/lang/common_data_es.py b/lingua_franca/lang/common_data_es.py index f8662839..54893d10 100644 --- a/lingua_franca/lang/common_data_es.py +++ b/lingua_franca/lang/common_data_es.py @@ -35,7 +35,26 @@ "mujeres": "f", "madre": "f", "hombre": "m", - "padre": "m" + "padre": "m", + "mapa": "m", + "poema": "m", + "problema": "m", + "día": "m", + "moto": "f", + "radio": "f", + "mano": "f", + "foto": "f", + "amor": "m", + "corazón": "m", + "árbol": "m", + "canción": "f", + "ciudad": "f", + "flor": "f", + "jefe": "m", + "café": "m", + "baile": "m", + "gente": "f", + "serie": "f" } # context rules for gender From 73c27fc032aa8506676f520a3f92ed401364276f Mon Sep 17 00:00:00 2001 From: Angel Docampo Date: Thu, 17 Oct 2019 17:56:03 +0200 Subject: [PATCH 3/4] tmp commit --- lingua_franca/lang/parse_es copy.py | 1068 +++++++++++++++++++++++++++ lingua_franca/lang/parse_es.py | 494 ++++++++----- 2 files changed, 1380 insertions(+), 182 deletions(-) create mode 100644 lingua_franca/lang/parse_es copy.py diff --git a/lingua_franca/lang/parse_es copy.py b/lingua_franca/lang/parse_es copy.py new file mode 100644 index 00000000..bebda23d --- /dev/null +++ b/lingua_franca/lang/parse_es copy.py @@ -0,0 +1,1068 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" + Parse functions for spanish (es) + TODO: numbers greater than 999999 +""" +from datetime import datetime +from dateutil.relativedelta import relativedelta +from lingua_franca.lang.parse_common import is_numeric, look_for_fractions +from lingua_franca.lang.common_data_es import _ARTICLES_ES, _NUM_STRING_ES + + +def isFractional_es(input_str): + """ + This function takes the given text and checks if it is a fraction. + + Args: + text (str): the string to check if fractional + Returns: + (bool) or (float): False if not a fraction, otherwise the fraction + + """ + if input_str.endswith('s', -1): + input_str = input_str[:len(input_str) - 1] # e.g. "fifths" + + aFrac = ["medio", "media", "tercio", "cuarto", "cuarta", "quinto", + "quinta", "sexto", "sexta", u"séptimo", u"séptima", "octavo", + "octava", "noveno", "novena", u"décimo", u"décima", u"onceavo", + u"onceava", u"doceavo", u"doceava"] + + if input_str.lower() in aFrac: + return 1.0 / (aFrac.index(input_str) + 2) + if (input_str == "cuarto" or input_str == "cuarta"): + return 1.0 / 4 + if (input_str == u"vigésimo" or input_str == u"vigésima"): + return 1.0 / 20 + if (input_str == u"trigésimo" or input_str == u"trigésima"): + return 1.0 / 30 + if (input_str == u"centésimo" or input_str == u"centésima"): + return 1.0 / 100 + if (input_str == u"milésimo" or input_str == u"milésima"): + return 1.0 / 1000 + return False + + +def extractnumber_es(text): + """ + This function prepares the given text for parsing by making + numbers consistent, getting rid of contractions, etc. + Args: + text (str): the string to normalize + Returns: + (int) or (float): The value of extracted number + + """ + aWords = text.split() + count = 0 + result = None + while count < len(aWords): + val = 0 + word = aWords[count] + next_next_word = None + if count + 1 < len(aWords): + next_word = aWords[count + 1] + if count + 2 < len(aWords): + next_next_word = aWords[count + 2] + else: + next_word = None + + # is current word a number? + if word in _NUM_STRING_ES: + val = _NUM_STRING_ES[word] + elif word.isdigit(): # doesn't work with decimals + val = int(word) + elif is_numeric(word): + val = float(word) + elif isFractional_es(word): + if not result: + result = 1 + result = result * isFractional_es(word) + count += 1 + continue + + if not val: + # look for fractions like "2/3" + aPieces = word.split('/') + # if (len(aPieces) == 2 and is_numeric(aPieces[0]) + # and is_numeric(aPieces[1])): + if look_for_fractions(aPieces): + val = float(aPieces[0]) / float(aPieces[1]) + + if val: + if result is None: + result = 0 + # handle fractions + if next_word != "avos": + result += val + else: + result = float(result) / float(val) + + if next_word is None: + break + + # number word and fraction + ands = ["e"] + if next_word in ands: + zeros = 0 + if result is None: + count += 1 + continue + newWords = aWords[count + 2:] + newText = "" + for word in newWords: + newText += word + " " + + afterAndVal = extractnumber_es(newText[:-1]) + if afterAndVal: + if result < afterAndVal or result < 20: + while afterAndVal > 1: + afterAndVal = afterAndVal / 10.0 + for word in newWords: + if word == "cero" or word == "0": + zeros += 1 + else: + break + for _ in range(0, zeros): + afterAndVal = afterAndVal / 10.0 + result += afterAndVal + break + elif next_next_word is not None: + if next_next_word in ands: + newWords = aWords[count + 3:] + newText = "" + for word in newWords: + newText += word + " " + afterAndVal = extractnumber_es(newText[:-1]) + if afterAndVal: + if result is None: + result = 0 + result += afterAndVal + break + + decimals = ["punto", "coma", ".", ","] + if next_word in decimals: + zeros = 0 + newWords = aWords[count + 2:] + newText = "" + for word in newWords: + newText += word + " " + for word in newWords: + if word == "cero" or word == "0": + zeros += 1 + else: + break + afterDotVal = str(extractnumber_es(newText[:-1])) + afterDotVal = zeros * "0" + afterDotVal + result = float(str(result) + "." + afterDotVal) + break + count += 1 + + if result is None: + return False + + # Return the $str with the number related words removed + # (now empty strings, so strlen == 0) + # aWords = [word for word in aWords if len(word) > 0] + # text = ' '.join(aWords) + if "." in str(result): + integer, dec = str(result).split(".") + # cast float to int + if dec == "0": + result = int(integer) + + return result + + +def es_number_parse(words, i): + def es_cte(i, s): + if i < len(words) and s == words[i]: + return s, i + 1 + return None + + def es_number_word(i, mi, ma): + if i < len(words): + v = _NUM_STRING_ES.get(words[i]) + if v and v >= mi and v <= ma: + return v, i + 1 + return None + + def es_number_1_99(i): + r1 = es_number_word(i, 1, 29) + if r1: + return r1 + + r1 = es_number_word(i, 30, 90) + if r1: + v1, i1 = r1 + r2 = es_cte(i1, "y") + if r2: + i2 = r2[1] + r3 = es_number_word(i2, 1, 9) + if r3: + v3, i3 = r3 + return v1 + v3, i3 + return r1 + return None + + def es_number_1_999(i): + # [2-9]cientos [1-99]? + r1 = es_number_word(i, 100, 900) + if r1: + v1, i1 = r1 + r2 = es_number_1_99(i1) + if r2: + v2, i2 = r2 + return v1 + v2, i2 + else: + return r1 + + # [1-99] + r1 = es_number_1_99(i) + if r1: + return r1 + + return None + + def es_number(i): + # check for cero + r1 = es_number_word(i, 0, 0) + if r1: + return r1 + + # check for [1-999] (mil [0-999])? + r1 = es_number_1_999(i) + if r1: + v1, i1 = r1 + r2 = es_cte(i1, "mil") + if r2: + i2 = r2[1] + r3 = es_number_1_999(i2) + if r3: + v3, i3 = r3 + return v1 * 1000 + v3, i3 + else: + return v1 * 1000, i2 + else: + return r1 + return None + + return es_number(i) + + +def normalize_es(text, remove_articles): + """ Spanish string normalization """ + + words = text.split() # this also removed extra spaces + + normalized = "" + i = 0 + while i < len(words): + word = words[i] + + if remove_articles and word in _ARTICLES_ES: + i += 1 + continue + + # Convert numbers into digits + r = es_number_parse(words, i) + if r: + v, i = r + normalized += " " + str(v) + continue + + normalized += " " + word + i += 1 + + return normalized[1:] # strip the initial space + + +def extract_datetime_es(input_str, currentDate=None, default_time=None): + def clean_string(s): + # cleans the input string of unneeded punctuation and capitalization + # among other things + symbols = [".", ",", ";", "?", "!", u"º", u"ª"] + noise_words = ["entre", "la", "del", "al", "el", "de", + "por", "para", "una", "cualquier", "a", + "e'", "esta", "este"] + + for word in symbols: + s = s.replace(word, "") + for word in noise_words: + s = s.replace(" " + word + " ", " ") + s = s.lower().replace( + u"á", + "a").replace( + u"é", + "e").replace( + u"ó", + "o").replace( + "-", + " ").replace( + "_", + "") + # handle synonims and equivalents, "tomorrow early = tomorrow morning + synonims = {u"mañana": ["amanecer", "temprano", "muy temprano"], + "tarde": ["media tarde", "atardecer"], + "noche": ["anochecer", "tarde"]} + for syn in synonims: + for word in synonims[syn]: + s = s.replace(" " + word + " ", " " + syn + " ") + # relevant plurals, cant just extract all s in pt + wordlist = [u"mañanas", "tardes", "noches", u"días", "semanas", + u"años", "minutos", "segundos", "las", "los", "siguientes", + u"próximas", u"próximos", "horas"] + for _, word in enumerate(wordlist): + s = s.replace(word, word.rstrip('s')) + s = s.replace("meses", "mes").replace("anteriores", "anterior") + return s + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if input_str == "": + return None + if currentDate is None: + currentDate = datetime.now() + + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + dateNow = currentDate + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + words = clean_string(input_str).split(" ") + timeQualifiersList = [u'mañana', 'tarde', 'noche'] + time_indicators = ["en", "la", "al", "por", "pasados", + "pasadas", u"día", "hora"] + days = ['lunes', 'martes', u'miércoles', + 'jueves', 'viernes', u'sábado', 'domingo'] + months = ['enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', + 'julio', 'agosto', 'septiembre', 'octubre', 'noviembre', + 'diciembre'] + monthsShort = ['ene', 'feb', 'mar', 'abr', 'may', 'jun', 'jul', 'ago', + 'sep', 'oct', 'nov', 'dic'] + nexts = ["siguiente", u"próximo", u"próxima"] + suffix_nexts = ["siguientes", "subsecuentes"] + lasts = [u"último", u"última"] + suffix_lasts = ["pasada", "pasado", "anterior", "antes"] + nxts = [u"después", "siguiente", u"próximo", u"próxima"] + prevs = ["antes", "previa", "previo", "anterior"] + froms = ["desde", "en", "para", u"después de", "por", u"próximo", + u"próxima", "de"] + thises = ["este", "esta"] + froms += thises + lists = nxts + prevs + froms + time_indicators + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + + start = idx + used = 0 + # save timequalifier for later + if word in timeQualifiersList: + timeQualifier = word + + # parse today, tomorrow, yesterday + elif word == "hoy" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == u"mañana" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "ayer" and not fromFlag: + dayOffset -= 1 + used += 1 + # "before yesterday" and "before before yesterday" + elif (word == "anteayer" or + (word == "ante" and wordNext == "ayer")) and not fromFlag: + dayOffset -= 2 + used += 1 + if wordNext == "ayer": + used += 1 + elif word == "ante" and wordNext == "ante" and wordNextNext == \ + "ayer" and not fromFlag: + dayOffset -= 3 + used += 3 + elif word == "ante anteayer" and not fromFlag: + dayOffset -= 3 + used += 1 + # day after tomorrow + elif word == "pasado" and wordNext == u"mañana" and not fromFlag: + dayOffset += 2 + used = 2 + # day before yesterday + elif word == "ante" and wordNext == "ayer" and not fromFlag: + dayOffset -= 2 + used = 2 + # parse 5 days, 10 weeks, last week, next week, week after + elif word == u"día": + if wordNext == "pasado" or wordNext == "ante": + used += 1 + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used += 1 + elif (wordPrev and wordPrev[0].isdigit() and + wordNext not in months and + wordNext not in monthsShort): + dayOffset += int(wordPrev) + start -= 1 + used += 2 + elif wordNext and wordNext[0].isdigit() and wordNextNext not in \ + months and wordNextNext not in monthsShort: + dayOffset += int(wordNext) + start -= 1 + used += 2 + + elif word == "semana" and not fromFlag: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + dayOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "mes" and not fromFlag: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + monthOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + monthOffset = -7 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == u"año" and not fromFlag: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + yearOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + yearOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + yearOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + yearOffset = -7 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. + elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "siguiente": + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "pasado": + dayOffset -= 7 + used += 1 + start -= 1 + if wordNext == "siguiente": + # dayOffset += 7 + used += 1 + elif wordNext == "pasado": + # dayOffset -= 7 + used += 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months[m] + if wordPrev and wordPrev[0].isdigit(): + # 13 mayo + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + # mayo 13 + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordPrevPrev and wordPrevPrev[0].isdigit(): + # 13 dia mayo + datestr += " " + wordPrevPrev + + start -= 2 + used += 2 + if wordNext and word[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNextNext and wordNextNext[0].isdigit(): + # mayo dia 13 + datestr += " " + wordNextNext + used += 2 + if wordNextNextNext and wordNextNextNext[0].isdigit(): + datestr += " " + wordNextNextNext + used += 1 + hasYear = True + else: + hasYear = False + + if datestr in months: + datestr = "" + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + monthsShort + validFollowups.append("hoy") + validFollowups.append(u"mañana") + validFollowups.append("ayer") + validFollowups.append("anteayer") + validFollowups.append("ahora") + validFollowups.append("ya") + validFollowups.append("ante") + + # TODO debug word "depois" that one is failing for some reason + if word in froms and wordNext in validFollowups: + + if not (wordNext == u"mañana" and wordNext == "ayer") and not ( + word == "pasado" or word == "antes"): + used = 2 + fromFlag = True + if wordNext == u"mañana" and word != "pasado": + dayOffset += 1 + elif wordNext == "ayer": + dayOffset -= 1 + elif wordNext == "anteayer": + dayOffset -= 2 + elif wordNext == "ante" and wordNextNext == "ayer": + dayOffset -= 2 + elif (wordNext == "ante" and wordNext == "ante" and + wordNextNextNext == "ayer"): + dayOffset -= 3 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + # if wordNextNext == "feira": + # used += 1 + if tmpOffset < 0: + tmpOffset += 7 + if wordNextNext: + if wordNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNextNextNext: + if wordNextNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + # if wordNextNextNext == "feira": + # used += 1 + if wordNext in months: + used -= 1 + if used > 0: + + if start - 1 > 0 and words[start - 1] in lists: + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in lists: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "medio" and wordNext == u"día": + hrAbs = 12 + used += 2 + elif word == "media" and wordNext == "noche": + hrAbs = 0 + used += 2 + elif word == u"mañana": + if not hrAbs: + hrAbs = 8 + used += 1 + elif word == "tarde": + if not hrAbs: + hrAbs = 15 + used += 1 + elif word == "media" and wordNext == "tarde": + if not hrAbs: + hrAbs = 17 + used += 2 + elif word == "tarde" and wordNext == "noche": + if not hrAbs: + hrAbs = 20 + used += 2 + elif word == "media" and wordNext == u"mañana": + if not hrAbs: + hrAbs = 10 + used += 2 + # elif word == "fim" and wordNext == "tarde": + # if not hrAbs: + # hrAbs = 19 + # used += 2 + # elif word == "fim" and wordNext == "manha": + # if not hrAbs: + # hrAbs = 11 + # used += 2 + elif word == "madrugada": + if not hrAbs: + hrAbs = 1 + used += 2 + elif word == "noche": + if not hrAbs: + hrAbs = 21 + used += 1 + # parse half an hour, quarter hour + elif word == "hora" and \ + (wordPrev in time_indicators or wordPrevPrev in + time_indicators): + if wordPrev == "media": + minOffset = 30 + elif wordPrev == "cuarto": + minOffset = 15 + elif wordPrevPrev == "cuarto": + minOffset = 15 + if idx > 2 and words[idx - 3] in time_indicators: + words[idx - 3] = "" + words[idx - 2] = "" + else: + hrOffset = 1 + if wordPrevPrev in time_indicators: + words[idx - 2] = "" + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + elif wordNext == u"mañana" or wordNext == "madrugada": + remainder = "am" + used += 1 + elif wordNext == "tarde": + remainder = "pm" + used += 1 + elif wordNext == "noche": + if 0 < int(word[0]) < 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + elif wordNext in thises and wordNextNext == u"mañana": + remainder = "am" + used = 2 + elif wordNext in thises and wordNextNext == "tarde": + remainder = "pm" + used = 2 + elif wordNext in thises and wordNextNext == "noche": + remainder = "pm" + used = 2 + else: + if timeQualifier != "": + if strHH <= 12 and \ + (timeQualifier == u"mañana" or + timeQualifier == "tarde"): + strHH += 12 + + else: + # try to parse # s without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + else: + if (wordNext == "pm" or + wordNext == "p.m." or + wordNext == "tarde"): + strHH = strNum + remainder = "pm" + used = 1 + elif (wordNext == "am" or + wordNext == "a.m." or + wordNext == u"mañana"): + strHH = strNum + remainder = "am" + used = 1 + elif (int(word) > 100 and + ( + # wordPrev == "o" or + # wordPrev == "oh" or + wordPrev == "cero" + )): + # 0800 hours (pronounced oh-eight-hundred) + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + if wordNext == "hora": + used += 1 + elif ( + wordNext == "hora" and + word[0] != '0' and + ( + int(word) < 100 and + int(word) > 2400 + )): + # ignores military time + # "in 3 hours" + hrOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + + elif wordNext == "minuto": + # "in 10 minutes" + minOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "segundo": + # in 5 seconds + secOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(word) > 100: + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + if wordNext == "hora": + used += 1 + + elif wordNext == "" or ( + wordNext == "en" and wordNextNext == "punto"): + strHH = word + strMM = 00 + if wordNext == "en" and wordNextNext == "punto": + used += 2 + if wordNextNextNext == "tarde": + remainder = "pm" + used += 1 + elif wordNextNextNext == u"mañana": + remainder = "am" + used += 1 + elif wordNextNextNext == "noche": + if 0 > strHH > 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + + elif wordNext[0].isdigit(): + strHH = word + strMM = wordNext + used += 1 + if wordNextNext == "hora": + used += 1 + else: + isTime = False + + strHH = int(strHH) if strHH else 0 + strMM = int(strMM) if strMM else 0 + strHH = strHH + 12 if (remainder == "pm" and + 0 < strHH < 12) else strHH + strHH = strHH - 12 if (remainder == "am" and + 0 < strHH >= 12) else strHH + if strHH > 24 or strMM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = strHH * 1 + minAbs = strMM * 1 + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + words[idx + i] = "" + + if wordPrev == "en" or wordPrev == "punto": + words[words.index(wordPrev)] = "" + + if idx > 0 and wordPrev in time_indicators: + words[idx - 1] = "" + if idx > 1 and wordPrevPrev in time_indicators: + words[idx - 2] = "" + + idx += used - 1 + found = True + + # check that we found a date + if not date_found: + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = dateNow + extractedDate = extractedDate.replace(microsecond=0, + second=0, + minute=0, + hour=0) + if datestr != "": + en_months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', + 'aug', + 'sept', 'oct', 'nov', 'dec'] + for idx, en_month in enumerate(en_months): + datestr = datestr.replace(months[idx], en_month) + for idx, en_month in enumerate(en_monthsShort): + datestr = datestr.replace(monthsShort[idx], en_month) + + temp = datetime.strptime(datestr, "%B %d") + if not hasYear: + temp = temp.replace(year=extractedDate.year) + if extractedDate < temp: + extractedDate = extractedDate.replace(year=int(currentYear), + month=int( + temp.strftime( + "%m")), + day=int(temp.strftime( + "%d"))) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + + if hrAbs is None and minAbs is None and default_time: + hrAbs = default_time.hour + minAbs = default_time.minute + + if hrAbs != -1 and minAbs != -1: + extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, + minutes=minAbs or 0) + if (hrAbs or minAbs) and datestr == "": + if not daySpecified and dateNow > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + + resultStr = " ".join(words) + resultStr = ' '.join(resultStr.split()) + # resultStr = pt_pruning(resultStr) + return [extractedDate, resultStr] + + +def get_gender_es(word, raw_string=""): + # Next rules are imprecise and incompleted, but is a good starting point. + # For more detailed explanation, see + # http://www.wikilengua.org/index.php/Género_gramatical + word = word.rstrip("s") + gender = False + words = raw_string.split(" ") + for idx, w in enumerate(words): + if w == word and idx != 0: + previous = words[idx - 1] + gender = get_gender_es(previous) + break + if not gender: + if word[-1] == "a": + gender = "f" + if word[-1] == "o" or word[-1] == "e": + gender = "m" + return gender diff --git a/lingua_franca/lang/parse_es.py b/lingua_franca/lang/parse_es.py index bebda23d..499c356c 100644 --- a/lingua_franca/lang/parse_es.py +++ b/lingua_franca/lang/parse_es.py @@ -15,16 +15,21 @@ # limitations under the License. # """ - Parse functions for spanish (es) + Parse functions for Spanish (es_ES) + TODO: numbers greater than 999999 + TODO: date time es """ + from datetime import datetime from dateutil.relativedelta import relativedelta from lingua_franca.lang.parse_common import is_numeric, look_for_fractions -from lingua_franca.lang.common_data_es import _ARTICLES_ES, _NUM_STRING_ES +from lingua_franca.lang.common_data_es import _FRACTION_STRING_ES, \ + _ARTICLES_ES, _NUMBERS_ES, _FEMALE_DETERMINANTS_ES, _FEMALE_ENDINGS_ES,\ + _MALE_DETERMINANTS_ES, _MALE_ENDINGS_ES, _GENDERS_ES -def isFractional_es(input_str): +def isFractional_pt(input_str): """ This function takes the given text and checks if it is a fraction. @@ -37,27 +42,27 @@ def isFractional_es(input_str): if input_str.endswith('s', -1): input_str = input_str[:len(input_str) - 1] # e.g. "fifths" - aFrac = ["medio", "media", "tercio", "cuarto", "cuarta", "quinto", - "quinta", "sexto", "sexta", u"séptimo", u"séptima", "octavo", - "octava", "noveno", "novena", u"décimo", u"décima", u"onceavo", - u"onceava", u"doceavo", u"doceava"] + aFrac = ["meio", u"terço", "quarto", "quinto", "sexto", + "setimo", "oitavo", "nono", u"décimo"] if input_str.lower() in aFrac: return 1.0 / (aFrac.index(input_str) + 2) - if (input_str == "cuarto" or input_str == "cuarta"): - return 1.0 / 4 - if (input_str == u"vigésimo" or input_str == u"vigésima"): + if input_str == u"vigésimo": return 1.0 / 20 - if (input_str == u"trigésimo" or input_str == u"trigésima"): + if input_str == u"trigésimo": return 1.0 / 30 - if (input_str == u"centésimo" or input_str == u"centésima"): + if input_str == u"centésimo": return 1.0 / 100 - if (input_str == u"milésimo" or input_str == u"milésima"): + if input_str == u"milésimo": return 1.0 / 1000 + if (input_str == u"sétimo" or input_str == "septimo" or + input_str == u"séptimo"): + return 1.0 / 7 + return False -def extractnumber_es(text): +def extractnumber_pt(text): """ This function prepares the given text for parsing by making numbers consistent, getting rid of contractions, etc. @@ -82,16 +87,16 @@ def extractnumber_es(text): next_word = None # is current word a number? - if word in _NUM_STRING_ES: - val = _NUM_STRING_ES[word] + if word in _NUMBERS_ES: + val = _NUMBERS_ES[word] elif word.isdigit(): # doesn't work with decimals val = int(word) elif is_numeric(word): val = float(word) - elif isFractional_es(word): + elif isFractional_pt(word): if not result: result = 1 - result = result * isFractional_es(word) + result = result * isFractional_pt(word) count += 1 continue @@ -127,13 +132,13 @@ def extractnumber_es(text): for word in newWords: newText += word + " " - afterAndVal = extractnumber_es(newText[:-1]) + afterAndVal = extractnumber_pt(newText[:-1]) if afterAndVal: if result < afterAndVal or result < 20: while afterAndVal > 1: afterAndVal = afterAndVal / 10.0 for word in newWords: - if word == "cero" or word == "0": + if word == "zero" or word == "0": zeros += 1 else: break @@ -147,14 +152,14 @@ def extractnumber_es(text): newText = "" for word in newWords: newText += word + " " - afterAndVal = extractnumber_es(newText[:-1]) + afterAndVal = extractnumber_pt(newText[:-1]) if afterAndVal: if result is None: result = 0 result += afterAndVal break - decimals = ["punto", "coma", ".", ","] + decimals = ["ponto", "virgula", "vírgula", ".", ","] if next_word in decimals: zeros = 0 newWords = aWords[count + 2:] @@ -162,11 +167,11 @@ def extractnumber_es(text): for word in newWords: newText += word + " " for word in newWords: - if word == "cero" or word == "0": + if word == "zero" or word == "0": zeros += 1 else: break - afterDotVal = str(extractnumber_es(newText[:-1])) + afterDotVal = str(extractnumber_pt(newText[:-1])) afterDotVal = zeros * "0" + afterDotVal result = float(str(result) + "." + afterDotVal) break @@ -188,43 +193,43 @@ def extractnumber_es(text): return result -def es_number_parse(words, i): - def es_cte(i, s): +def pt_number_parse(words, i): + def pt_cte(i, s): if i < len(words) and s == words[i]: return s, i + 1 return None - def es_number_word(i, mi, ma): + def pt_number_word(i, mi, ma): if i < len(words): - v = _NUM_STRING_ES.get(words[i]) + v = _NUMBERS_ES.get(words[i]) if v and v >= mi and v <= ma: return v, i + 1 return None - def es_number_1_99(i): - r1 = es_number_word(i, 1, 29) + def pt_number_1_99(i): + r1 = pt_number_word(i, 1, 29) if r1: return r1 - r1 = es_number_word(i, 30, 90) + r1 = pt_number_word(i, 30, 90) if r1: v1, i1 = r1 - r2 = es_cte(i1, "y") + r2 = pt_cte(i1, "e") if r2: i2 = r2[1] - r3 = es_number_word(i2, 1, 9) + r3 = pt_number_word(i2, 1, 9) if r3: v3, i3 = r3 return v1 + v3, i3 return r1 return None - def es_number_1_999(i): + def pt_number_1_999(i): # [2-9]cientos [1-99]? - r1 = es_number_word(i, 100, 900) + r1 = pt_number_word(i, 100, 900) if r1: v1, i1 = r1 - r2 = es_number_1_99(i1) + r2 = pt_number_1_99(i1) if r2: v2, i2 = r2 return v1 + v2, i2 @@ -232,26 +237,26 @@ def es_number_1_999(i): return r1 # [1-99] - r1 = es_number_1_99(i) + r1 = pt_number_1_99(i) if r1: return r1 return None - def es_number(i): + def pt_number(i): # check for cero - r1 = es_number_word(i, 0, 0) + r1 = pt_number_word(i, 0, 0) if r1: return r1 # check for [1-999] (mil [0-999])? - r1 = es_number_1_999(i) + r1 = pt_number_1_999(i) if r1: v1, i1 = r1 - r2 = es_cte(i1, "mil") + r2 = pt_cte(i1, "mil") if r2: i2 = r2[1] - r3 = es_number_1_999(i2) + r3 = pt_number_1_999(i2) if r3: v3, i3 = r3 return v1 * 1000 + v3, i3 @@ -261,44 +266,54 @@ def es_number(i): return r1 return None - return es_number(i) + return pt_number(i) -def normalize_es(text, remove_articles): - """ Spanish string normalization """ +def normalize_pt(text, remove_articles): + """ PT string normalization """ words = text.split() # this also removed extra spaces + normalized = "" + # Contractions are not common in PT + # Convert numbers into digits, e.g. "dois" -> "2" normalized = "" i = 0 while i < len(words): word = words[i] - + # remove articles if remove_articles and word in _ARTICLES_ES: i += 1 continue # Convert numbers into digits - r = es_number_parse(words, i) + r = pt_number_parse(words, i) if r: v, i = r normalized += " " + str(v) continue + # NOTE temporary , handle some numbers above >999 + if word in _NUMBERS_ES: + word = str(_NUMBERS_ES[word]) + # end temporary + normalized += " " + word i += 1 + # some articles in pt-pt can not be removed, but many words can + # this is experimental and some meaning may be lost + # maybe agressive should default to False + # only usage will tell, as a native speaker this seems reasonable + return pt_pruning(normalized[1:], agressive=remove_articles) - return normalized[1:] # strip the initial space - -def extract_datetime_es(input_str, currentDate=None, default_time=None): +def extract_datetime_pt(input_str, currentDate, default_time): def clean_string(s): # cleans the input string of unneeded punctuation and capitalization # among other things symbols = [".", ",", ";", "?", "!", u"º", u"ª"] - noise_words = ["entre", "la", "del", "al", "el", "de", - "por", "para", "una", "cualquier", "a", - "e'", "esta", "este"] + noise_words = ["o", "os", "a", "as", "do", "da", "dos", "das", "de", + "ao", "aos"] for word in symbols: s = s.replace(word, "") @@ -307,25 +322,39 @@ def clean_string(s): s = s.lower().replace( u"á", "a").replace( + u"ç", + "c").replace( + u"à", + "a").replace( + u"ã", + "a").replace( u"é", "e").replace( + u"è", + "e").replace( + u"ê", + "e").replace( u"ó", "o").replace( + u"ò", + "o").replace( "-", " ").replace( "_", "") # handle synonims and equivalents, "tomorrow early = tomorrow morning - synonims = {u"mañana": ["amanecer", "temprano", "muy temprano"], - "tarde": ["media tarde", "atardecer"], - "noche": ["anochecer", "tarde"]} + synonims = {"manha": ["manhazinha", "cedo", "cedinho"], + "tarde": ["tardinha", "tarde"], + "noite": ["noitinha", "anoitecer"], + "todos": ["ao", "aos"], + "em": ["do", "da", "dos", "das", "de"]} for syn in synonims: for word in synonims[syn]: s = s.replace(" " + word + " ", " " + syn + " ") # relevant plurals, cant just extract all s in pt - wordlist = [u"mañanas", "tardes", "noches", u"días", "semanas", - u"años", "minutos", "segundos", "las", "los", "siguientes", - u"próximas", u"próximos", "horas"] + wordlist = ["manhas", "noites", "tardes", "dias", "semanas", "anos", + "minutos", "segundos", "nas", "nos", "proximas", + "seguintes", "horas"] for _, word in enumerate(wordlist): s = s.replace(word, word.rstrip('s')) s = s.replace("meses", "mes").replace("anteriores", "anterior") @@ -334,17 +363,15 @@ def clean_string(s): def date_found(): return found or \ ( - datestr != "" or + datestr != "" or timeStr != "" or yearOffset != 0 or monthOffset != 0 or dayOffset is True or hrOffset != 0 or hrAbs or minOffset != 0 or minAbs or secOffset != 0 ) - if input_str == "": + if input_str == "" or not currentDate: return None - if currentDate is None: - currentDate = datetime.now() found = False daySpecified = False @@ -360,25 +387,26 @@ def date_found(): timeQualifier = "" words = clean_string(input_str).split(" ") - timeQualifiersList = [u'mañana', 'tarde', 'noche'] - time_indicators = ["en", "la", "al", "por", "pasados", - "pasadas", u"día", "hora"] - days = ['lunes', 'martes', u'miércoles', - 'jueves', 'viernes', u'sábado', 'domingo'] - months = ['enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', - 'julio', 'agosto', 'septiembre', 'octubre', 'noviembre', - 'diciembre'] - monthsShort = ['ene', 'feb', 'mar', 'abr', 'may', 'jun', 'jul', 'ago', - 'sep', 'oct', 'nov', 'dic'] - nexts = ["siguiente", u"próximo", u"próxima"] - suffix_nexts = ["siguientes", "subsecuentes"] - lasts = [u"último", u"última"] - suffix_lasts = ["pasada", "pasado", "anterior", "antes"] - nxts = [u"después", "siguiente", u"próximo", u"próxima"] - prevs = ["antes", "previa", "previo", "anterior"] - froms = ["desde", "en", "para", u"después de", "por", u"próximo", - u"próxima", "de"] - thises = ["este", "esta"] + timeQualifiersList = ['manha', 'tarde', 'noite'] + time_indicators = ["em", "as", "nas", "pelas", "volta", "depois", "estas", + "no", "dia", "hora"] + days = ['segunda', 'terca', 'quarta', + 'quinta', 'sexta', 'sabado', 'domingo'] + months = ['janeiro', 'febreiro', 'marco', 'abril', 'maio', 'junho', + 'julho', 'agosto', 'setembro', 'outubro', 'novembro', + 'dezembro'] + monthsShort = ['jan', 'feb', 'mar', 'abr', 'mai', 'jun', 'jul', 'ag', + 'set', 'out', 'nov', 'dec'] + nexts = ["proximo", "proxima"] + suffix_nexts = ["seguinte", "subsequente", "seguir"] + lasts = ["ultimo", "ultima"] + suffix_lasts = ["passada", "passado", "anterior", "antes"] + nxts = ["depois", "seguir", "seguida", "seguinte", "proxima", "proximo"] + prevs = ["antes", "ante", "previa", "previamente", "anterior"] + froms = ["partir", "em", "para", "na", "no", "daqui", "seguir", + "depois", "por", "proxima", "proximo", "da", "do", "de"] + thises = ["este", "esta", "deste", "desta", "neste", "nesta", "nesse", + "nessa"] froms += thises lists = nxts + prevs + froms + time_indicators for idx, word in enumerate(words): @@ -397,40 +425,40 @@ def date_found(): timeQualifier = word # parse today, tomorrow, yesterday - elif word == "hoy" and not fromFlag: + elif word == "hoje" and not fromFlag: dayOffset = 0 used += 1 - elif word == u"mañana" and not fromFlag: + elif word == "amanha" and not fromFlag: dayOffset = 1 used += 1 - elif word == "ayer" and not fromFlag: + elif word == "ontem" and not fromFlag: dayOffset -= 1 used += 1 # "before yesterday" and "before before yesterday" - elif (word == "anteayer" or - (word == "ante" and wordNext == "ayer")) and not fromFlag: + elif (word == "anteontem" or + (word == "ante" and wordNext == "ontem")) and not fromFlag: dayOffset -= 2 used += 1 - if wordNext == "ayer": + if wordNext == "ontem": used += 1 elif word == "ante" and wordNext == "ante" and wordNextNext == \ - "ayer" and not fromFlag: + "ontem" and not fromFlag: dayOffset -= 3 used += 3 - elif word == "ante anteayer" and not fromFlag: + elif word == "anteanteontem" and not fromFlag: dayOffset -= 3 used += 1 # day after tomorrow - elif word == "pasado" and wordNext == u"mañana" and not fromFlag: + elif word == "depois" and wordNext == "amanha" and not fromFlag: dayOffset += 2 used = 2 # day before yesterday - elif word == "ante" and wordNext == "ayer" and not fromFlag: + elif word == "antes" and wordNext == "ontem" and not fromFlag: dayOffset -= 2 used = 2 # parse 5 days, 10 weeks, last week, next week, week after - elif word == u"día": - if wordNext == "pasado" or wordNext == "ante": + elif word == "dia": + if wordNext == "depois" or wordNext == "antes": used += 1 if wordPrev and wordPrev[0].isdigit(): dayOffset += int(wordPrev) @@ -500,7 +528,7 @@ def date_found(): start -= 1 used = 2 # parse 5 years, next year, last year - elif word == u"año" and not fromFlag: + elif word == "ano" and not fromFlag: if wordPrev[0].isdigit(): yearOffset = int(wordPrev) start -= 1 @@ -528,24 +556,33 @@ def date_found(): # parse Monday, Tuesday, etc., and next Monday, # last Tuesday, etc. elif word in days and not fromFlag: + d = days.index(word) dayOffset = (d + 1) - int(today) used = 1 if dayOffset < 0: dayOffset += 7 - if wordPrev == "siguiente": - dayOffset += 7 - used += 1 - start -= 1 - elif wordPrev == "pasado": - dayOffset -= 7 - used += 1 - start -= 1 - if wordNext == "siguiente": - # dayOffset += 7 - used += 1 - elif wordNext == "pasado": - # dayOffset -= 7 + for w in nexts: + if wordPrev == w: + dayOffset += 7 + used += 1 + start -= 1 + for w in lasts: + if wordPrev == w: + dayOffset -= 7 + used += 1 + start -= 1 + for w in suffix_nexts: + if wordNext == w: + dayOffset += 7 + used += 1 + start -= 1 + for w in suffix_lasts: + if wordNext == w: + dayOffset -= 7 + used += 1 + start -= 1 + if wordNext == "feira": used += 1 # parse 15 of July, June 20th, Feb 18, 19 of February elif word in months or word in monthsShort: @@ -556,7 +593,7 @@ def date_found(): used += 1 datestr = months[m] if wordPrev and wordPrev[0].isdigit(): - # 13 mayo + # 13 maio datestr += " " + wordPrev start -= 1 used += 1 @@ -568,7 +605,7 @@ def date_found(): hasYear = False elif wordNext and wordNext[0].isdigit(): - # mayo 13 + # maio 13 datestr += " " + wordNext used += 1 if wordNextNext and wordNextNext[0].isdigit(): @@ -579,7 +616,7 @@ def date_found(): hasYear = False elif wordPrevPrev and wordPrevPrev[0].isdigit(): - # 13 dia mayo + # 13 dia maio datestr += " " + wordPrevPrev start -= 2 @@ -592,7 +629,7 @@ def date_found(): hasYear = False elif wordNextNext and wordNextNext[0].isdigit(): - # mayo dia 13 + # maio dia 13 datestr += " " + wordNextNext used += 2 if wordNextNextNext and wordNextNextNext[0].isdigit(): @@ -608,38 +645,38 @@ def date_found(): # parse 5 days from tomorrow, 10 weeks from next thursday, # 2 months from July validFollowups = days + months + monthsShort - validFollowups.append("hoy") - validFollowups.append(u"mañana") - validFollowups.append("ayer") - validFollowups.append("anteayer") - validFollowups.append("ahora") - validFollowups.append("ya") + validFollowups.append("hoje") + validFollowups.append("amanha") + validFollowups.append("ontem") + validFollowups.append("anteontem") + validFollowups.append("agora") + validFollowups.append("ja") validFollowups.append("ante") # TODO debug word "depois" that one is failing for some reason if word in froms and wordNext in validFollowups: - if not (wordNext == u"mañana" and wordNext == "ayer") and not ( - word == "pasado" or word == "antes"): + if not (wordNext == "amanha" and wordNext == "ontem") and not ( + word == "depois" or word == "antes" or word == "em"): used = 2 fromFlag = True - if wordNext == u"mañana" and word != "pasado": + if wordNext == "amanha" and word != "depois": dayOffset += 1 - elif wordNext == "ayer": + elif wordNext == "ontem": dayOffset -= 1 - elif wordNext == "anteayer": + elif wordNext == "anteontem": dayOffset -= 2 - elif wordNext == "ante" and wordNextNext == "ayer": + elif wordNext == "ante" and wordNextNext == "ontem": dayOffset -= 2 - elif (wordNext == "ante" and wordNext == "ante" and - wordNextNextNext == "ayer"): + elif (wordNext == "ante" and wordNextNext == "ante" and + wordNextNextNext == "ontem"): dayOffset -= 3 elif wordNext in days: d = days.index(wordNext) tmpOffset = (d + 1) - int(today) used = 2 - # if wordNextNext == "feira": - # used += 1 + if wordNextNext == "feira": + used += 1 if tmpOffset < 0: tmpOffset += 7 if wordNextNext: @@ -662,8 +699,8 @@ def date_found(): tmpOffset -= 7 used += 1 dayOffset += tmpOffset - # if wordNextNextNext == "feira": - # used += 1 + if wordNextNextNext == "feira": + used += 1 if wordNext in months: used -= 1 if used > 0: @@ -681,11 +718,15 @@ def date_found(): daySpecified = True # parse time + timeStr = "" hrOffset = 0 minOffset = 0 secOffset = 0 hrAbs = None minAbs = None + + # TODO: Is this necessary? + military = False for idx, word in enumerate(words): if word == "": @@ -697,17 +738,58 @@ def date_found(): wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" # parse noon, midnight, morning, afternoon, evening + # TODO: used = 0 - if word == "medio" and wordNext == u"día": + # if word == 'mediodía': + # hrAbs = 12 + # used += 1 + # elif word == 'medianoche': + # hrAbs = 0 + # used += 1 + # if word == 'medio' and wordNext == 'día': + # hrAbs = 12 + # used += 2 + # elif word == 'media' and wordNext == 'noche': + # hrAbs = 0 + # used += 2 + # elif word == 'mañana' and wordPrev == 'la': + # if not hrAbs: + # hrAbs = 8 + # used += 1 + # if wordNext and wordNext[0].isdigit(): # mattina alle 5 + # hrAbs = int(wordNext) + # used += 1 + # elif word == 'tarde': + # if not hrAbs: + # hrAbs = 15 + # used += 1 + # if wordNext and wordNext[0].isdigit(): # pomeriggio alle 5 + # hrAbs = int(wordNext) + # used += 1 + # if (hrAbs or 0) < 12: + # hrAbs = (hrAbs or 0) + 12 + # elif word == 'noche': + # if not hrAbs: + # hrAbs = 19 + # used += 1 + # if wordNext and wordNext[0].isdigit() \ + # and ':' not in wordNext: + # hrAbs = int(wordNext) + # used += 1 + # if (hrAbs or 0) < 12: + # hrAbs = (hrAbs or 0) + 12 + + + if word == "medio" and wordNext == "día": hrAbs = 12 used += 2 elif word == "media" and wordNext == "noche": hrAbs = 0 used += 2 - elif word == u"mañana": + elif word == "mañana" and wordPrev == "la": if not hrAbs: hrAbs = 8 - used += 1 + used += 2 elif word == "tarde": if not hrAbs: hrAbs = 15 @@ -716,29 +798,25 @@ def date_found(): if not hrAbs: hrAbs = 17 used += 2 - elif word == "tarde" and wordNext == "noche": + elif word == "media" and wordNext == "mañana": if not hrAbs: - hrAbs = 20 + hrAbs = 10 used += 2 - elif word == "media" and wordNext == u"mañana": + elif word == "tarde" and wordNext == "noche": if not hrAbs: - hrAbs = 10 + hrAbs = 19 used += 2 - # elif word == "fim" and wordNext == "tarde": - # if not hrAbs: - # hrAbs = 19 - # used += 2 # elif word == "fim" and wordNext == "manha": # if not hrAbs: # hrAbs = 11 # used += 2 elif word == "madrugada": if not hrAbs: - hrAbs = 1 - used += 2 + hrAbs = 4 + used += 1 elif word == "noche": if not hrAbs: - hrAbs = 21 + hrAbs = 22 used += 1 # parse half an hour, quarter hour elif word == "hora" and \ @@ -795,31 +873,32 @@ def date_found(): if nextWord == "am" or nextWord == "pm": remainder = nextWord used += 1 - elif wordNext == u"mañana" or wordNext == "madrugada": + elif wordNext == "manha": remainder = "am" used += 1 elif wordNext == "tarde": remainder = "pm" used += 1 - elif wordNext == "noche": + elif wordNext == "noite": if 0 < int(word[0]) < 6: remainder = "am" else: remainder = "pm" used += 1 - elif wordNext in thises and wordNextNext == u"mañana": + elif wordNext in thises and wordNextNext == "manha": remainder = "am" used = 2 elif wordNext in thises and wordNextNext == "tarde": remainder = "pm" used = 2 - elif wordNext in thises and wordNextNext == "noche": + elif wordNext in thises and wordNextNext == "noite": remainder = "pm" used = 2 else: if timeQualifier != "": + military = True if strHH <= 12 and \ - (timeQualifier == u"mañana" or + (timeQualifier == "manha" or timeQualifier == "tarde"): strHH += 12 @@ -863,19 +942,20 @@ def date_found(): used = 1 elif (wordNext == "am" or wordNext == "a.m." or - wordNext == u"mañana"): + wordNext == "manha"): strHH = strNum remainder = "am" used = 1 elif (int(word) > 100 and ( - # wordPrev == "o" or - # wordPrev == "oh" or - wordPrev == "cero" + wordPrev == "o" or + wordPrev == "oh" or + wordPrev == "zero" )): # 0800 hours (pronounced oh-eight-hundred) strHH = int(word) / 100 strMM = int(word) - strHH * 100 + military = True if wordNext == "hora": used += 1 elif ( @@ -910,23 +990,24 @@ def date_found(): elif int(word) > 100: strHH = int(word) / 100 strMM = int(word) - strHH * 100 + military = True if wordNext == "hora": used += 1 elif wordNext == "" or ( - wordNext == "en" and wordNextNext == "punto"): + wordNext == "em" and wordNextNext == "ponto"): strHH = word strMM = 00 - if wordNext == "en" and wordNextNext == "punto": + if wordNext == "em" and wordNextNext == "ponto": used += 2 if wordNextNextNext == "tarde": remainder = "pm" used += 1 - elif wordNextNextNext == u"mañana": + elif wordNextNextNext == "manha": remainder = "am" used += 1 - elif wordNextNextNext == "noche": - if 0 > strHH > 6: + elif wordNextNextNext == "noite": + if 0 > int(strHH) > 6: remainder = "am" else: remainder = "pm" @@ -935,6 +1016,7 @@ def date_found(): elif wordNext[0].isdigit(): strHH = word strMM = wordNext + military = True used += 1 if wordNextNext == "hora": used += 1 @@ -960,7 +1042,7 @@ def date_found(): for i in range(used): words[idx + i] = "" - if wordPrev == "en" or wordPrev == "punto": + if wordPrev == "em" or wordPrev == "ponto": words[words.index(wordPrev)] = "" if idx > 0 and wordPrev in time_indicators: @@ -1018,18 +1100,22 @@ def date_found(): month=int(temp.strftime("%m")), day=int(temp.strftime("%d"))) + if timeStr != "": + temp = datetime(timeStr) + extractedDate = extractedDate.replace(hour=temp.strftime("%H"), + minute=temp.strftime("%M"), + second=temp.strftime("%S")) + if yearOffset != 0: extractedDate = extractedDate + relativedelta(years=yearOffset) if monthOffset != 0: extractedDate = extractedDate + relativedelta(months=monthOffset) if dayOffset != 0: extractedDate = extractedDate + relativedelta(days=dayOffset) - - if hrAbs is None and minAbs is None and default_time: - hrAbs = default_time.hour - minAbs = default_time.minute - - if hrAbs != -1 and minAbs != -1: + if (hrAbs or 0) != -1 and (minAbs or 0) != -1: + if hrAbs is None and minAbs is None and default_time: + hrAbs = default_time.hour + minAbs = default_time.minute extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, minutes=minAbs or 0) if (hrAbs or minAbs) and datestr == "": @@ -1044,25 +1130,69 @@ def date_found(): resultStr = " ".join(words) resultStr = ' '.join(resultStr.split()) - # resultStr = pt_pruning(resultStr) + resultStr = pt_pruning(resultStr) return [extractedDate, resultStr] -def get_gender_es(word, raw_string=""): - # Next rules are imprecise and incompleted, but is a good starting point. - # For more detailed explanation, see - # http://www.wikilengua.org/index.php/Género_gramatical - word = word.rstrip("s") - gender = False - words = raw_string.split(" ") +def pt_pruning(text, symbols=True, accents=True, agressive=True): + # agressive pt word pruning + words = ["a", "o", "os", "as", "de", "dos", "das", + "lhe", "lhes", "me", "e", "no", "nas", "na", "nos", "em", "para", + "este", + "esta", "deste", "desta", "neste", "nesta", "nesse", + "nessa", "foi", "que"] + if symbols: + symbols = [".", ",", ";", ":", "!", "?", u"�", u"�"] + for symbol in symbols: + text = text.replace(symbol, "") + text = text.replace("-", " ").replace("_", " ") + if accents: + accents = {"a": [u"á", u"à", u"ã", u"â"], + "e": [u"ê", u"è", u"é"], + "i": [u"í", u"ì"], + "o": [u"ò", u"ó"], + "u": [u"ú", u"ù"], + "c": [u"ç"]} + for char in accents: + for acc in accents[char]: + text = text.replace(acc, char) + if agressive: + text_words = text.split(" ") + for idx, word in enumerate(text_words): + if word in words: + text_words[idx] = "" + text = " ".join(text_words) + text = ' '.join(text.split()) + return text + + +def get_gender_pt(word, text=""): + # parse gender taking context into account + word = word.lower() + words = text.lower().split(" ") for idx, w in enumerate(words): if w == word and idx != 0: - previous = words[idx - 1] - gender = get_gender_es(previous) - break - if not gender: - if word[-1] == "a": - gender = "f" - if word[-1] == "o" or word[-1] == "e": - gender = "m" - return gender + # in portuguese usually the previous word (a determinant) + # assigns gender to the next word + previous = words[idx - 1].lower() + if previous in _MALE_DETERMINANTS_ES: + return "m" + elif previous in _FEMALE_DETERMINANTS_ES: + return "f" + + # get gender using only the individual word + # see if this word has the gender defined + if word in _GENDERS_ES: + return _GENDERS_ES[word] + singular = word.rstrip("s") + if singular in _GENDERS_ES: + return _GENDERS_ES[singular] + # in portuguese the last vowel usually defines the gender of a word + # the gender of the determinant takes precedence over this rule + for end_str in _FEMALE_ENDINGS_ES: + if word.endswith(end_str): + return "f" + for end_str in _MALE_ENDINGS_ES: + if word.endswith(end_str): + return "m" + return None From 26df660de9f287c8d073806621900298b087a955 Mon Sep 17 00:00:00 2001 From: Angel Docampo Date: Tue, 22 Oct 2019 11:17:12 +0200 Subject: [PATCH 4/4] test --- .vscode/launch.json | 17 + .vscode/settings.json | 3 + lingua_franca/lang/common_data_es.py | 8 +- lingua_franca/lang/parse_es.py | 303 +++++---- lingua_franca/parse.py | 5 +- test/test_parse_es.py | 921 +++++++++++++++++++++++++-- 6 files changed, 1042 insertions(+), 215 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..b281853e --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,17 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + + + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..c7cadb4d --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": ".venv/bin/python" +} \ No newline at end of file diff --git a/lingua_franca/lang/common_data_es.py b/lingua_franca/lang/common_data_es.py index 54893d10..81e51323 100644 --- a/lingua_franca/lang/common_data_es.py +++ b/lingua_franca/lang/common_data_es.py @@ -26,7 +26,7 @@ # word rules for gender # TODO: review rules to see exceptions -_FEMALE_ENDINGS_ES = ["a", "as"] +_FEMALE_ENDINGS_ES = ["a", "as", "triz"] _MALE_ENDINGS_ES = ["o", "os"] # special cases, word lookup for words not covered by above rule @@ -58,8 +58,10 @@ } # context rules for gender -_MALE_DETERMINANTS_ES = ["lo", "los", "este", "estos", "ese", "esos"] -_FEMALE_DETERMINANTS_ES = ["la", "las", "esta", "estas", "esa", "esas"] +_MALE_DETERMINANTS_ES = ["lo", "los", "este", "estos", "ese", "esos", + "un", "unos"] +_FEMALE_DETERMINANTS_ES = ["la", "las", "esta", "estas", "esa", "esas", + "una", "unas"] _NUMBERS_ES = { diff --git a/lingua_franca/lang/parse_es.py b/lingua_franca/lang/parse_es.py index 499c356c..52c3f7f5 100644 --- a/lingua_franca/lang/parse_es.py +++ b/lingua_franca/lang/parse_es.py @@ -29,7 +29,7 @@ _MALE_DETERMINANTS_ES, _MALE_ENDINGS_ES, _GENDERS_ES -def isFractional_pt(input_str): +def isFractional_es(input_str): """ This function takes the given text and checks if it is a fraction. @@ -42,27 +42,23 @@ def isFractional_pt(input_str): if input_str.endswith('s', -1): input_str = input_str[:len(input_str) - 1] # e.g. "fifths" - aFrac = ["meio", u"terço", "quarto", "quinto", "sexto", - "setimo", "oitavo", "nono", u"décimo"] + aFrac = ["medio", "tercio", "cuarto", "quinto", "sexto", + "séptimo", "octavo", "noveno", "décimo"] if input_str.lower() in aFrac: return 1.0 / (aFrac.index(input_str) + 2) - if input_str == u"vigésimo": + if input_str == "vigésimo": return 1.0 / 20 - if input_str == u"trigésimo": + if input_str == "trigésimo": return 1.0 / 30 - if input_str == u"centésimo": + if input_str == "centésimo": return 1.0 / 100 - if input_str == u"milésimo": + if input_str == "milésimo": return 1.0 / 1000 - if (input_str == u"sétimo" or input_str == "septimo" or - input_str == u"séptimo"): - return 1.0 / 7 - return False -def extractnumber_pt(text): +def extractnumber_es(text): """ This function prepares the given text for parsing by making numbers consistent, getting rid of contractions, etc. @@ -93,10 +89,10 @@ def extractnumber_pt(text): val = int(word) elif is_numeric(word): val = float(word) - elif isFractional_pt(word): + elif isFractional_es(word): if not result: result = 1 - result = result * isFractional_pt(word) + result = result * isFractional_es(word) count += 1 continue @@ -121,7 +117,7 @@ def extractnumber_pt(text): break # number word and fraction - ands = ["e"] + ands = ["y"] if next_word in ands: zeros = 0 if result is None: @@ -132,13 +128,13 @@ def extractnumber_pt(text): for word in newWords: newText += word + " " - afterAndVal = extractnumber_pt(newText[:-1]) + afterAndVal = extractnumber_es(newText[:-1]) if afterAndVal: if result < afterAndVal or result < 20: while afterAndVal > 1: afterAndVal = afterAndVal / 10.0 for word in newWords: - if word == "zero" or word == "0": + if word == "cero" or word == "0": zeros += 1 else: break @@ -152,14 +148,14 @@ def extractnumber_pt(text): newText = "" for word in newWords: newText += word + " " - afterAndVal = extractnumber_pt(newText[:-1]) + afterAndVal = extractnumber_es(newText[:-1]) if afterAndVal: if result is None: result = 0 result += afterAndVal break - decimals = ["ponto", "virgula", "vírgula", ".", ","] + decimals = ["punto", "coma", ".", ","] if next_word in decimals: zeros = 0 newWords = aWords[count + 2:] @@ -167,11 +163,11 @@ def extractnumber_pt(text): for word in newWords: newText += word + " " for word in newWords: - if word == "zero" or word == "0": + if word == "cero" or word == "0": zeros += 1 else: break - afterDotVal = str(extractnumber_pt(newText[:-1])) + afterDotVal = str(extractnumber_es(newText[:-1])) afterDotVal = zeros * "0" + afterDotVal result = float(str(result) + "." + afterDotVal) break @@ -192,44 +188,44 @@ def extractnumber_pt(text): return result - -def pt_number_parse(words, i): - def pt_cte(i, s): +# TODO: specify lang at the end on function names +def es_number_parse(words, i): + def es_cte(i, s): if i < len(words) and s == words[i]: return s, i + 1 return None - def pt_number_word(i, mi, ma): + def es_number_word(i, mi, ma): if i < len(words): v = _NUMBERS_ES.get(words[i]) if v and v >= mi and v <= ma: return v, i + 1 return None - def pt_number_1_99(i): - r1 = pt_number_word(i, 1, 29) + def es_number_1_99(i): + r1 = es_number_word(i, 1, 29) if r1: return r1 - r1 = pt_number_word(i, 30, 90) + r1 = es_number_word(i, 30, 90) if r1: v1, i1 = r1 - r2 = pt_cte(i1, "e") + r2 = es_cte(i1, "y") if r2: i2 = r2[1] - r3 = pt_number_word(i2, 1, 9) + r3 = es_number_word(i2, 1, 9) if r3: v3, i3 = r3 return v1 + v3, i3 return r1 return None - def pt_number_1_999(i): + def es_number_1_999(i): # [2-9]cientos [1-99]? - r1 = pt_number_word(i, 100, 900) + r1 = es_number_word(i, 100, 900) if r1: v1, i1 = r1 - r2 = pt_number_1_99(i1) + r2 = es_number_1_99(i1) if r2: v2, i2 = r2 return v1 + v2, i2 @@ -237,26 +233,26 @@ def pt_number_1_999(i): return r1 # [1-99] - r1 = pt_number_1_99(i) + r1 = es_number_1_99(i) if r1: return r1 return None - def pt_number(i): + def es_number(i): # check for cero - r1 = pt_number_word(i, 0, 0) + r1 = es_number_word(i, 0, 0) if r1: return r1 # check for [1-999] (mil [0-999])? - r1 = pt_number_1_999(i) + r1 = es_number_1_999(i) if r1: v1, i1 = r1 - r2 = pt_cte(i1, "mil") + r2 = es_cte(i1, "mil") if r2: i2 = r2[1] - r3 = pt_number_1_999(i2) + r3 = es_number_1_999(i2) if r3: v3, i3 = r3 return v1 * 1000 + v3, i3 @@ -266,15 +262,15 @@ def pt_number(i): return r1 return None - return pt_number(i) + return es_number(i) -def normalize_pt(text, remove_articles): - """ PT string normalization """ +def normalize_es(text, remove_articles): + """ ES string normalization """ words = text.split() # this also removed extra spaces normalized = "" - # Contractions are not common in PT + # Contractions are not common in ES # Convert numbers into digits, e.g. "dois" -> "2" normalized = "" @@ -287,7 +283,7 @@ def normalize_pt(text, remove_articles): continue # Convert numbers into digits - r = pt_number_parse(words, i) + r = es_number_parse(words, i) if r: v, i = r normalized += " " + str(v) @@ -300,59 +296,51 @@ def normalize_pt(text, remove_articles): normalized += " " + word i += 1 - # some articles in pt-pt can not be removed, but many words can + # some articles in es_ES can not be removed, but many words can # this is experimental and some meaning may be lost # maybe agressive should default to False # only usage will tell, as a native speaker this seems reasonable - return pt_pruning(normalized[1:], agressive=remove_articles) + return es_pruning(normalized[1:], agressive=remove_articles) -def extract_datetime_pt(input_str, currentDate, default_time): +def extract_datetime_es(input_str, currentDate, default_time): def clean_string(s): # cleans the input string of unneeded punctuation and capitalization # among other things - symbols = [".", ",", ";", "?", "!", u"º", u"ª"] - noise_words = ["o", "os", "a", "as", "do", "da", "dos", "das", "de", - "ao", "aos"] + symbols = [".", ",", ";", "?", "!", "º", "ª"] + noise_words = ["lo", "los", "la", "las", "de", "para"] for word in symbols: s = s.replace(word, "") for word in noise_words: s = s.replace(" " + word + " ", " ") + # TODO: need to replace "ñ" as well? s = s.lower().replace( - u"á", - "a").replace( - u"ç", - "c").replace( - u"à", - "a").replace( - u"ã", + "á", "a").replace( - u"é", + "é", "e").replace( - u"è", - "e").replace( - u"ê", - "e").replace( - u"ó", - "o").replace( - u"ò", + "í", + "i").replace( + "ó", "o").replace( + "ú", + "u").replace( "-", " ").replace( "_", "") # handle synonims and equivalents, "tomorrow early = tomorrow morning - synonims = {"manha": ["manhazinha", "cedo", "cedinho"], - "tarde": ["tardinha", "tarde"], - "noite": ["noitinha", "anoitecer"], - "todos": ["ao", "aos"], - "em": ["do", "da", "dos", "das", "de"]} + synonims = {"mañana": ["matutino", "temprano", "pronto"], + "tarde": ["atardecer", "tarde"], + "noche": ["nocturno", "anochecer"], + "todos": ["al", "hacia"], + "en": ["de"]} for syn in synonims: for word in synonims[syn]: s = s.replace(" " + word + " ", " " + syn + " ") - # relevant plurals, cant just extract all s in pt - wordlist = ["manhas", "noites", "tardes", "dias", "semanas", "anos", + # relevant plurals, cant just extract all s in es + wordlist = ["mañanas", "noches", "tardes", "dias", "semanas", "años", "minutos", "segundos", "nas", "nos", "proximas", "seguintes", "horas"] for _, word in enumerate(wordlist): @@ -387,26 +375,31 @@ def date_found(): timeQualifier = "" words = clean_string(input_str).split(" ") - timeQualifiersList = ['manha', 'tarde', 'noite'] - time_indicators = ["em", "as", "nas", "pelas", "volta", "depois", "estas", - "no", "dia", "hora"] - days = ['segunda', 'terca', 'quarta', - 'quinta', 'sexta', 'sabado', 'domingo'] - months = ['janeiro', 'febreiro', 'marco', 'abril', 'maio', 'junho', - 'julho', 'agosto', 'setembro', 'outubro', 'novembro', - 'dezembro'] - monthsShort = ['jan', 'feb', 'mar', 'abr', 'mai', 'jun', 'jul', 'ag', - 'set', 'out', 'nov', 'dec'] + timeQualifiersList = ['mañana', 'tarde', 'noche'] + # TODO: need to remove "la" from the following? + # There are also other compound time indicators like + # "después de", "pasadas las", "pasada la", "dentro de", etc + time_indicators = ["por", "las", "la", "esta", "pasada", "tras", "estas", + "desde", "dia", "hora"] + days = ['lunes', 'martes', 'miercoles', + 'jueves', 'viernes', 'sabado', 'domingo'] + months = ['enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', + 'julio', 'agosto', 'septiembre', 'octubre', 'noviembre', + 'diciembre'] + monthsShort = ['ene', 'feb', 'mar', 'abr', 'may', 'jun', 'jul', 'ago', + 'sep', 'oct', 'nov', 'dic'] nexts = ["proximo", "proxima"] - suffix_nexts = ["seguinte", "subsequente", "seguir"] + # TODO: most used suffix is "que viene" + suffix_nexts = ["siguiente", "subsiguiente"] lasts = ["ultimo", "ultima"] - suffix_lasts = ["passada", "passado", "anterior", "antes"] - nxts = ["depois", "seguir", "seguida", "seguinte", "proxima", "proximo"] + suffix_lasts = ["pasada", "pasado", "anterior", "antes"] + nxts = ["después", "siguiente", "proxima", "proximo"] prevs = ["antes", "ante", "previa", "previamente", "anterior"] - froms = ["partir", "em", "para", "na", "no", "daqui", "seguir", - "depois", "por", "proxima", "proximo", "da", "do", "de"] - thises = ["este", "esta", "deste", "desta", "neste", "nesta", "nesse", - "nessa"] + # TODO: there are many compound particles, like + # "desde aquí" "después de", "a partir de" etc + froms = ["partir", "en", "para", "despues","por", "proxima", + "proximo", "de"] + thises = ["este", "esta", "ese", "esa", "esos", "esas", "estos", "estas"] froms += thises lists = nxts + prevs + froms + time_indicators for idx, word in enumerate(words): @@ -425,40 +418,40 @@ def date_found(): timeQualifier = word # parse today, tomorrow, yesterday - elif word == "hoje" and not fromFlag: + elif word == "hoy" and not fromFlag: dayOffset = 0 used += 1 - elif word == "amanha" and not fromFlag: + elif word == "mañana" and not fromFlag: dayOffset = 1 used += 1 - elif word == "ontem" and not fromFlag: + elif word == "ayer" and not fromFlag: dayOffset -= 1 used += 1 # "before yesterday" and "before before yesterday" - elif (word == "anteontem" or - (word == "ante" and wordNext == "ontem")) and not fromFlag: + elif (word == "anteayer" or + (word == "ante" and wordNext == "ayer")) and not fromFlag: dayOffset -= 2 used += 1 - if wordNext == "ontem": + if wordNext == "ayer": used += 1 elif word == "ante" and wordNext == "ante" and wordNextNext == \ - "ontem" and not fromFlag: + "ayer" and not fromFlag: dayOffset -= 3 used += 3 - elif word == "anteanteontem" and not fromFlag: + elif word == "anteanteayer" and not fromFlag: dayOffset -= 3 used += 1 # day after tomorrow - elif word == "depois" and wordNext == "amanha" and not fromFlag: + elif word == "pasado" and wordNext == "mañana" and not fromFlag: dayOffset += 2 used = 2 # day before yesterday - elif word == "antes" and wordNext == "ontem" and not fromFlag: + elif word == "ante" and wordNext == "ayer" and not fromFlag: dayOffset -= 2 used = 2 # parse 5 days, 10 weeks, last week, next week, week after elif word == "dia": - if wordNext == "depois" or wordNext == "antes": + if wordNext == "pasado" or wordNext == "antes": used += 1 if wordPrev and wordPrev[0].isdigit(): dayOffset += int(wordPrev) @@ -528,7 +521,7 @@ def date_found(): start -= 1 used = 2 # parse 5 years, next year, last year - elif word == "ano" and not fromFlag: + elif word == "año" and not fromFlag: if wordPrev[0].isdigit(): yearOffset = int(wordPrev) start -= 1 @@ -582,7 +575,7 @@ def date_found(): dayOffset -= 7 used += 1 start -= 1 - if wordNext == "feira": + if wordNext == "día": used += 1 # parse 15 of July, June 20th, Feb 18, 19 of February elif word in months or word in monthsShort: @@ -593,7 +586,7 @@ def date_found(): used += 1 datestr = months[m] if wordPrev and wordPrev[0].isdigit(): - # 13 maio + # 13 mayo datestr += " " + wordPrev start -= 1 used += 1 @@ -605,7 +598,7 @@ def date_found(): hasYear = False elif wordNext and wordNext[0].isdigit(): - # maio 13 + # mayo 13 datestr += " " + wordNext used += 1 if wordNextNext and wordNextNext[0].isdigit(): @@ -616,7 +609,7 @@ def date_found(): hasYear = False elif wordPrevPrev and wordPrevPrev[0].isdigit(): - # 13 dia maio + # 13 dia mayo datestr += " " + wordPrevPrev start -= 2 @@ -629,7 +622,7 @@ def date_found(): hasYear = False elif wordNextNext and wordNextNext[0].isdigit(): - # maio dia 13 + # mayo dia 13 datestr += " " + wordNextNext used += 2 if wordNextNextNext and wordNextNextNext[0].isdigit(): @@ -645,37 +638,37 @@ def date_found(): # parse 5 days from tomorrow, 10 weeks from next thursday, # 2 months from July validFollowups = days + months + monthsShort - validFollowups.append("hoje") - validFollowups.append("amanha") - validFollowups.append("ontem") - validFollowups.append("anteontem") - validFollowups.append("agora") - validFollowups.append("ja") + validFollowups.append("hoy") + validFollowups.append("mañana") + validFollowups.append("ayer") + validFollowups.append("anteayer") + validFollowups.append("ahora") + validFollowups.append("ya") validFollowups.append("ante") - # TODO debug word "depois" that one is failing for some reason + # TODO debug word "pasado" that one is failing for some reason if word in froms and wordNext in validFollowups: - if not (wordNext == "amanha" and wordNext == "ontem") and not ( - word == "depois" or word == "antes" or word == "em"): + if not (wordNext == "mañana" and wordNext == "ayer") and not ( + word == "pasado" or word == "antes" or word == "en"): used = 2 fromFlag = True - if wordNext == "amanha" and word != "depois": + if wordNext == "mañana" and word != "pasado": dayOffset += 1 - elif wordNext == "ontem": + elif wordNext == "ayer": dayOffset -= 1 - elif wordNext == "anteontem": + elif wordNext == "anteayer": dayOffset -= 2 - elif wordNext == "ante" and wordNextNext == "ontem": + elif wordNext == "ante" and wordNextNext == "ayer": dayOffset -= 2 elif (wordNext == "ante" and wordNextNext == "ante" and - wordNextNextNext == "ontem"): + wordNextNextNext == "ayer"): dayOffset -= 3 elif wordNext in days: d = days.index(wordNext) tmpOffset = (d + 1) - int(today) used = 2 - if wordNextNext == "feira": + if wordNextNext == "día": used += 1 if tmpOffset < 0: tmpOffset += 7 @@ -699,7 +692,7 @@ def date_found(): tmpOffset -= 7 used += 1 dayOffset += tmpOffset - if wordNextNextNext == "feira": + if wordNextNextNext == "día": used += 1 if wordNext in months: used -= 1 @@ -726,7 +719,7 @@ def date_found(): minAbs = None # TODO: Is this necessary? - military = False + # military = False for idx, word in enumerate(words): if word == "": @@ -873,32 +866,32 @@ def date_found(): if nextWord == "am" or nextWord == "pm": remainder = nextWord used += 1 - elif wordNext == "manha": + elif wordNext == "mañana": remainder = "am" used += 1 elif wordNext == "tarde": remainder = "pm" used += 1 - elif wordNext == "noite": + elif wordNext == "noche": if 0 < int(word[0]) < 6: remainder = "am" else: remainder = "pm" used += 1 - elif wordNext in thises and wordNextNext == "manha": + elif wordNext in thises and wordNextNext == "mañana": remainder = "am" used = 2 elif wordNext in thises and wordNextNext == "tarde": remainder = "pm" used = 2 - elif wordNext in thises and wordNextNext == "noite": + elif wordNext in thises and wordNextNext == "noche": remainder = "pm" used = 2 else: if timeQualifier != "": - military = True + # military = True if strHH <= 12 and \ - (timeQualifier == "manha" or + (timeQualifier == "mañana" or timeQualifier == "tarde"): strHH += 12 @@ -942,7 +935,7 @@ def date_found(): used = 1 elif (wordNext == "am" or wordNext == "a.m." or - wordNext == "manha"): + wordNext == "mañana"): strHH = strNum remainder = "am" used = 1 @@ -950,12 +943,12 @@ def date_found(): ( wordPrev == "o" or wordPrev == "oh" or - wordPrev == "zero" + wordPrev == "cero" )): # 0800 hours (pronounced oh-eight-hundred) strHH = int(word) / 100 strMM = int(word) - strHH * 100 - military = True + # military = True if wordNext == "hora": used += 1 elif ( @@ -990,23 +983,23 @@ def date_found(): elif int(word) > 100: strHH = int(word) / 100 strMM = int(word) - strHH * 100 - military = True + # military = True if wordNext == "hora": used += 1 elif wordNext == "" or ( - wordNext == "em" and wordNextNext == "ponto"): + wordNext == "en" and wordNextNext == "punto"): strHH = word strMM = 00 - if wordNext == "em" and wordNextNext == "ponto": + if wordNext == "en" and wordNextNext == "punto": used += 2 if wordNextNextNext == "tarde": remainder = "pm" used += 1 - elif wordNextNextNext == "manha": + elif wordNextNextNext == "mañana": remainder = "am" used += 1 - elif wordNextNextNext == "noite": + elif wordNextNextNext == "noche": if 0 > int(strHH) > 6: remainder = "am" else: @@ -1016,7 +1009,7 @@ def date_found(): elif wordNext[0].isdigit(): strHH = word strMM = wordNext - military = True + # military = True used += 1 if wordNextNext == "hora": used += 1 @@ -1042,7 +1035,7 @@ def date_found(): for i in range(used): words[idx + i] = "" - if wordPrev == "em" or wordPrev == "ponto": + if wordPrev == "en" or wordPrev == "punto": words[words.index(wordPrev)] = "" if idx > 0 and wordPrev in time_indicators: @@ -1130,29 +1123,27 @@ def date_found(): resultStr = " ".join(words) resultStr = ' '.join(resultStr.split()) - resultStr = pt_pruning(resultStr) + resultStr = es_pruning(resultStr) return [extractedDate, resultStr] -def pt_pruning(text, symbols=True, accents=True, agressive=True): - # agressive pt word pruning - words = ["a", "o", "os", "as", "de", "dos", "das", - "lhe", "lhes", "me", "e", "no", "nas", "na", "nos", "em", "para", - "este", - "esta", "deste", "desta", "neste", "nesta", "nesse", - "nessa", "foi", "que"] +def es_pruning(text, symbols=True, accents=True, agressive=True): + # agressive es word pruning + words = ["la", "lo", "los", "las", "de", "le", "les", "me", "nos", + "para", "este", "esta", "estos", "estas", "era", "eran", "quién"] if symbols: symbols = [".", ",", ";", ":", "!", "?", u"�", u"�"] for symbol in symbols: text = text.replace(symbol, "") text = text.replace("-", " ").replace("_", " ") if accents: - accents = {"a": [u"á", u"à", u"ã", u"â"], - "e": [u"ê", u"è", u"é"], - "i": [u"í", u"ì"], - "o": [u"ò", u"ó"], - "u": [u"ú", u"ù"], - "c": [u"ç"]} + # TODO: Should I put here "n": "ñ"?? + accents = {"a": "á", + "e": "é", + "i": "í", + "o": "ó", + "u": "ú", + } for char in accents: for acc in accents[char]: text = text.replace(acc, char) @@ -1166,13 +1157,13 @@ def pt_pruning(text, symbols=True, accents=True, agressive=True): return text -def get_gender_pt(word, text=""): +def get_gender_es(word, text=""): # parse gender taking context into account word = word.lower() words = text.lower().split(" ") for idx, w in enumerate(words): if w == word and idx != 0: - # in portuguese usually the previous word (a determinant) + # in spanish usually the previous word (a determinant) # assigns gender to the next word previous = words[idx - 1].lower() if previous in _MALE_DETERMINANTS_ES: @@ -1187,7 +1178,7 @@ def get_gender_pt(word, text=""): singular = word.rstrip("s") if singular in _GENDERS_ES: return _GENDERS_ES[singular] - # in portuguese the last vowel usually defines the gender of a word + # in spanish the last vowel usually defines the gender of a word # the gender of the determinant takes precedence over this rule for end_str in _FEMALE_ENDINGS_ES: if word.endswith(end_str): diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py index b58275a8..4817b257 100644 --- a/lingua_franca/parse.py +++ b/lingua_franca/parse.py @@ -323,9 +323,10 @@ def get_gender(word, context="", lang=None): lang_code = get_primary_lang_code(lang) - if lang_code in ["pt", "es"]: - # spanish follows same rules + if lang_code == "pt": return get_gender_pt(word, context) + elif lang_code == "es": + return get_gender_es(word, context) elif lang_code == "it": return get_gender_it(word, context) # TODO: get_gender_xx for other languages diff --git a/test/test_parse_es.py b/test/test_parse_es.py index cb92e312..63e23b8a 100644 --- a/test/test_parse_es.py +++ b/test/test_parse_es.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- # # Copyright 2017 Mycroft AI Inc. # @@ -15,8 +15,13 @@ # limitations under the License. # import unittest - +import lingua_franca +from datetime import datetime, time from lingua_franca.parse import normalize +from lingua_franca.parse import get_gender +from lingua_franca.parse import extract_datetime +from lingua_franca.parse import extract_number, extract_numbers + class TestNormalize(unittest.TestCase): @@ -24,58 +29,866 @@ class TestNormalize(unittest.TestCase): Test cases for Spanish parsing """ def test_articles_es(self): - self.assertEqual(normalize("esta es la prueba", lang="es", - remove_articles=True), - "esta es prueba") - self.assertEqual(normalize("y otra prueba", lang="es", - remove_articles=True), - "y otra prueba") + """ + Test cases for Spanish remove_articles + """ + self.assertEqual( + normalize("esta es la prueba", lang="es", remove_articles=True), + "esta es prueba" + ) + self.assertEqual( + normalize("este es el exámen", lang="es", remove_articles=True), + "este es exámen" + ) + self.assertEqual( + normalize("estos son los carácteres", lang="es", + remove_articles=True), "estos son carácteres") + self.assertEqual( + normalize("y otra prueba", lang="es", remove_articles=True), + "y otra prueba" + ) + self.assertEqual( + normalize("estas son unas pruebas", + lang="es", remove_articles=True), "estas son pruebas") + + def test_extractnumber_es(self): + """ + Test cases for Spanish extract_number, lang='es' + """ + self.assertEqual(extract_number('esto es la primera prueba', + lang='es'), 1) + self.assertEqual(extract_number('esto es el 2 test', lang='es'), 2) + self.assertEqual(extract_number('esto es el segundo test', + lang='es', ordinals=True), 2) + self.assertEqual(extract_number('esto es un tercio de test', + lang='es'), 1.0 / 3.0) + self.assertEqual(extract_number('esto es el tercer test', + lang='es', ordinals=True), 3.0) + # TODO: FAIL + # self.assertEqual(extract_number('esto es el trigésimo sexto test', + # lang='es'), 36.0) + self.assertEqual(extract_number('esto es la prueba número 4', + lang='es'), 4) + self.assertEqual(extract_number('una taza', lang='es'), 1) + self.assertEqual(extract_number('un gato', lang='es'), 1) + self.assertEqual(extract_number('un tercio de taza', + lang='es'), 1.0 / 3.0) + self.assertEqual(extract_number('2 quintos de taza', lang='es'), 0.4) + self.assertEqual(extract_number('tres tazas', lang='es'), 3) + self.assertEqual(extract_number('1/3 tazas', lang='es'), 1.0 / 3.0) + self.assertEqual(extract_number('un cuarto de taza', lang='es'), 0.25) + self.assertEqual(extract_number('1/4 taza', lang='es'), 0.25) + self.assertEqual(extract_number('2/3 taza', lang='es'), 2.0 / 3.0) + self.assertEqual(extract_number('3/4 taza', lang='es'), 3.0 / 4.0) + self.assertEqual(extract_number('1 y 1/4 taza', lang='es'), 1.25) + self.assertEqual(extract_number('1 taza y medio', lang='es'), 1.5) + self.assertEqual(extract_number('una taza y medio', lang='es'), 1.5) + self.assertEqual(extract_number('una y media taza', lang='es'), 1.5) + self.assertEqual(extract_number('una y una media taza', + lang='es'), 1.5) + self.assertEqual(extract_number('tres cuartos de taza', + lang='es'), 3.0 / 4.0) + self.assertEqual(extract_number('veintidos', lang='es'), 22) + self.assertEqual(extract_number('doscientos', lang='es'), 200) + self.assertEqual(extract_number('nueve mil', lang='es'), 9000) + # TODO: Dos millones + self.assertEqual(extract_number('dos millón', + lang='es', + short_scale=False), 2000000) + # TODO: Dos millones + self.assertEqual(extract_number('dos millón quinientos mil ' + 'toneladas de metales', + lang='es'), 2500000) + # TODO: Trillones + self.assertEqual(extract_number('seis trillón', lang='es', + short_scale=False), + 6000000000000000000.0) + self.assertEqual(extract_number('seis trillón', short_scale=False, + lang='es'), 6e+18) + self.assertEqual(extract_number('un millardo un millón', + lang='es', + short_scale=False), 1001000000) + # TODO: cien + self.assertEqual(extract_number('un millardo ciento', + lang='es', + short_scale=False), 1000000100) + # TODO: Fail + # self.assertEqual(extract_number('dos millardo un millón ciento + # treinta y dos', + # lang='es'), 2001000132) + self.assertEqual(extract_number('veinte diecisieteavos', + lang='es'), 20.0/17.0) + self.assertEqual(extract_number('uno coma cinco', lang='es'), 1.5) + self.assertEqual(extract_number('tres punto catorce', + lang='es'), 3.14) + self.assertEqual(extract_number('cero coma dos', lang='es'), 0.2) + # TODO: millones + self.assertEqual(extract_number('mil millón de años', + lang='es'), 1000000000.0) + # TODO: trillones + self.assertEqual(extract_number('trillón de años', + short_scale=False, + lang='es'), 1000000000000000000.0) + self.assertEqual(extract_number('cien mil', lang='es'), 100000) + # TODO: fail + # self.assertEqual(extract_number('mil cuatrocientos noventa y dos', + # lang='es'), 1492) + self.assertEqual(extract_number('menos 2', lang='es'), -2) + self.assertEqual(extract_number('menos setenta', lang='es'), -70) + + # TODO: millones + self.assertEqual(extract_number('mil millón', + lang='es'), 1000000000) + # TODO: Fail + # self.assertEqual(extract_number('mil ciento uno', + # lang='es'), 1101) + self.assertEqual(extract_number('un sexto tercio', + lang='es'), 1 / 6 / 3) + self.assertEqual(extract_number('treinta segundos', lang='es'), 30) + self.assertEqual(extract_number('treinta segundos', lang='es', + ordinals=True), 30) + self.assertEqual(extract_number('siete y pico', lang='es'), 7.0) + self.assertEqual(extract_number('siete coma 5', lang='es'), 7.5) + self.assertEqual(extract_number('siete punto 575', lang='es'), 7.575) + self.assertEqual(extract_number('siete y medio', lang='es'), 7.5) + self.assertEqual(extract_number('siete y ochenta', lang='es'), 7.80) + self.assertEqual(extract_number('siete y ocho', lang='es'), 7.8) + self.assertEqual(extract_number('siete y cero ocho', + lang='es'), 7.08) + self.assertEqual(extract_number('siete coma cero cero cero ocho grados', + lang='es'), 7.0008) + self.assertEqual(extract_number('veinte treceavos', + lang='es'), 20.0 / 13.0) + self.assertEqual(extract_number('veinte treceavos', lang='es', + short_scale=True), 20.0 / 13.0) + # TODO: Fail sesenta y seis + # self.assertEqual(extract_number('seis coma sesenta y seis', + # lang='es'), 6.66) + # self.assertEqual(extract_number('seis punto sesenta y seis', + # lang='es'), 6.66) + # TODO: Fail seiscientos + sesenta y seis + # self.assertEqual(extract_number('seiscientos sesenta y seis', + # lang='es'), 666) + # TODO: Fail mil cuatrocientos + + # self.assertEqual(extract_number('mil cuatrocientos noventa y dos', + # lang='es'), 1492) + self.assertEqual(extract_number('seiscientos punto cero seis', + lang='es'), 600.06) + self.assertEqual(extract_number('seiscientos punto cero cero seis', + lang='es'), 600.006) + self.assertEqual(extract_number('seiscientos punto cero cero cero seis', + lang='es'), 600.0006) + self.assertEqual(extract_number('tres décimos ', + lang='es'), 0.30000000000000004) + # TODO: Fail décimas + # self.assertEqual(extract_number('tres décimas ', + # lang='es'), 0.30000000000000004) + # TODO: Fail + # self.assertEqual(extract_number('doce centésimos', + # lang='es'), 0.12) + # self.assertEqual(extract_number('cinco y cuarenta y dos milésimas', + # lang='es'), 5.042) + self.assertEqual(extract_number('mil uno', + lang='es'), 1001) + # TODO: Fail + # self.assertEqual(extract_number('dos mil veintidós dólares ', + # lang='es'), 2022) + # self.assertEqual(extract_number( + # 'ciento catorce mil cuatrocientos once dólares ', + # lang='es', ordinals=True, short_scale=True), 114411) + + # TODO: es veintitrés, no veintitres + self.assertEqual(extract_number('veintitres dólares ', lang='es'), 23) + self.assertEqual(extract_number('veintiuno años ', + lang='es'), 21) + # TODO: es veintiún + # self.assertEqual(extract_number('veintiún años ', + # lang='es'), 21) + + # TODO: Fail + # self.assertEqual(extract_number('doce y cuarenta y cinco ', + # lang='es'), 12.45) + self.assertEqual(extract_number('hazles saber si alguien llega ', + lang='es'), False) + self.assertTrue(extract_number('El tenista es rápido', + lang='es') is False) + self.assertTrue(extract_number('alguna', lang='es') is False) + self.assertTrue(extract_number('cota cero', + lang='es') is not False) + self.assertEqual(extract_number('cota cero', lang='es'), 0) + self.assertTrue(extract_number('cota 0', lang='es') is not False) + self.assertEqual(extract_number('cota 0', lang='es'), 0) + self.assertEqual(extract_number('un par de cervezas', lang='es'), 2) + self.assertEqual(extract_number('una centena de cervezas', + lang='es'), 100) + # TODO: Centenar + # self.assertEqual(extract_number('un centenar de cervezas', + # lang='es'), 100) + self.assertEqual(extract_number('un par de mil de cervezas', + lang='es'), 2000) + # TODO: Miles + # self.assertEqual(extract_number('un par de mil de cervezas', + # lang='es'), 2000) + self.assertEqual(extract_number('una decena de monedas', + lang='es'), 10) + # TODO: Docenas + self.assertEqual(extract_number('tres docena de huevos', + lang='es'), 36) + self.assertEqual(extract_number('cero gatos', + lang='es'), 0) + + def test_extractdatetime_es_not_normalized(self): + """ + Test cases for Spanish datetime parsing + + """ + def extractWithFormat_es(text): + date = datetime(2018, 1, 13, 13, 4) # Sab 13 Ene, 2018 @ 13:04 + [extractedDate, leftover] = extract_datetime(text, date, + lang='es') + extractedDate = extractedDate.strftime('%Y-%m-%d %H:%M:%S') + return [extractedDate, leftover] + # The following is a test taken from english, I don't really know what + # is supposed is doing here. + def testExtract_es(text, expected_date, expected_leftover): + res = extractWithFormat_es(normalize(text)) # era normalize(text) + self.assertEqual(res[0], expected_date, 'por=' + text) + self.assertEqual(res[1], expected_leftover, 'por=' + text) + + testExtract_es('qué hora es ahora', + '2018-01-13 13:04:00', 'que hora es') + testExtract_es('tras dos segundos', + '2018-01-13 13:04:02', '') + testExtract_es('en un minuto', + '2018-01-13 13:05:00', '') + testExtract_es('en un par de minutos', + '2018-01-13 13:06:00', '') + testExtract_es('en un par de horas', + '2018-01-13 15:04:00', '') + testExtract_es('en dos semanas', + '2018-01-27 00:00:00', '') + testExtract_es('en un par de meses', + '2018-03-13 00:00:00', '') + testExtract_es('en un par de años', + '2020-01-13 00:00:00', '') + # TODO: Fail + # testExtract_es('en una década', + # '2028-01-13 00:00:00', '') + # testExtract_es('en un par de décadas', + # '2038-01-13 00:00:00', '') + # testExtract_es('en la siguiente década', + # '2028-01-13 00:00:00', '') + # testExtract_es('en el próximo decenio', + # '2028-01-13 00:00:00', '') + # testExtract_es('en la última década', + # '2008-01-13 00:00:00', '') + # testExtract_es('en la década pasada', + # '2008-01-13 00:00:00', '') + # testExtract_es('en un siglo', + # '2118-01-13 00:00:00', '') + testExtract_es('en un milenio', + '3018-01-13 00:00:00', '') + # testExtract_es('en un par de décadas', + # '2038-01-13 00:00:00', '') + # testExtract_es('en 5 décadas', + # '2068-01-13 00:00:00', '') + # testExtract_es('en un par de siglos', + # '2218-01-13 00:00:00', '') + # testExtract_es('en 2 siglos', + # '2218-01-13 00:00:00', '') + # testExtract_es('en un par de milenios', + # '4018-01-13 00:00:00', '') + testExtract_es('cita en 1 hora', + '2018-01-13 14:04:00', 'cita') + testExtract_es('cita en un hora', + '2018-01-13 14:04:00', 'cita') + # testExtract_es('cita en una hora', + # '2018-01-13 14:04:00', 'cita') + # testExtract_es('lo quiero en una hora', + # '2018-01-13 14:04:00', 'quiero') + testExtract_es('en 1 segundo', + '2018-01-13 13:04:01', '') + testExtract_es('en 2 segundos', + '2018-01-13 13:04:02', '') + testExtract_es('Prepara la emboscada en 1 minuto', + '2018-01-13 13:05:00', 'prepara emboscada') + # testExtract_es('Prepara la emboscada en media hora', + # '2018-01-13 13:34:00', 'prepara emboscada') + # testExtract_es('prepara la emboscada en 5 días a partir de hoy', + # '2018-01-18 00:00:00', 'prepara emboscada') + # testExtract_es('cuál es el pronóstico del tiempo para pasado mañana', + # '2018-01-15 00:00:00', 'cuál es el pronóstico del tiempo') + # testExtract_es('¿cuál es el pronóstico del tiempo el próximo jueves?', + # '2018-01-18 00:00:00', 'cual es el pronóstico del tiempo') + testExtract_es('¿qué tiempo hizo el jueves pasado?', + '2018-01-11 00:00:00', 'que tiempo hizo') + # testExtract_es('cuál es el pronóstico del tiempo el jueves que viene?', + # '2018-01-25 00:00:00', 'cuál es el pronóstico del tiempo') + # testExtract_es('¿cuál fue la previsión del tiempo el pasado jueves?', + # '2018-01-11 00:00:00', 'cual fue la previsión del tiempo') + testExtract_es('cuál es la previsión del tiempo para hoy?', + '2018-01-13 00:00:00', 'cual es prevision tiempo') + testExtract_es('recuérdame a las 10:45 pm', + '2018-01-13 22:45:00', 'recuerdame') + # testExtract_es('qué tiempo hace el viernes por la mañana', + # '2018-01-19 08:00:00', 'que tiempo') + # testExtract_es('Qué tiempo hará mañana', + # '2018-01-14 00:00:00', 'que tiempo hara') + testExtract_es('cuál es el pronóstico del tiempo para esta tarde', + '2018-01-13 15:00:00', 'cual es pronostico tiempo') + # testExtract_es('quali sono le previsioni meteo di oggi tarde ' + # 'presto', + # '2018-01-13 14:00:00', 'cuál es el pronóstico del tiempo') + testExtract_es('cuál es el pronóstico del tiempo para esta noche', + '2018-01-13 19:00:00', 'cual es pronostico tiempo') + # testExtract_es('quali sono le previsioni meteo di estas sera tardi', + # '2018-01-13 20:00:00', 'cuál es el pronóstico del tiempo') + testExtract_es('cuál es el pronóstico del tiempo para este mediodía', + '2018-01-14 12:00:00', 'cual es pronostico tiempo') + testExtract_es('cuál es el pronóstico del tiempo para esta medianoche', + '2018-01-14 00:00:00', 'cual es pronostico tiempo') + testExtract_es('cuál es el pronóstico del tiempo para el medio día', + '2018-01-14 12:00:00', 'cual es pronostico tiempo') + testExtract_es('cuál es el pronóstico del tiempo para la media noche', + '2018-01-14 00:00:00', 'cual es pronostico tiempo') + # testExtract_es('cuál es el pronóstico del tiempo para esta mañana', + # '2018-01-14 08:00:00', 'cual es pronostico tiempo') + testExtract_es('recuérdame que llame a mamá el 3 de agosto.', + '2018-08-03 00:00:00', 'recuerdame que llame mama') + # testExtract_es('recuérdame que llame a mamá mañana a las 7 de la mañana', + # '2018-01-14 07:00:00', 'recuerdame que llame mama') + # testExtract_es('recuérdame que llame a mamá mañana a las 7 de la tarde', + # '2018-01-13 19:00:00', 'recuerdame que llame mama') + # testExtract_es('llamar a mamá en una hora', + # '2018-01-13 14:04:00', 'llamar mama') + testExtract_es('recuérdame que llame a mamá a las 0600', + '2018-01-14 06:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá las 09 y 30', + '2018-01-13 21:30:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá a las 7 en punto', + '2018-01-13 19:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá esta tarde a las 7 ' + 'en punto', + '2018-01-13 19:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá a las 7 de la tarde', + '2018-01-13 19:00:00', 'recuerdame que llame mama') + # testExtract_es('recuérdame que llame a mamá mañana a las 7 en punto' + # ' de la mañana', + # '2018-01-14 07:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá el jueves por la tarde ' + 'a las 7 en punto', + '2018-01-18 19:00:00', 'recuerdame que llame mama') + # testExtract_es('recuérdame que llame a mamá el jueves ' + # 'por la mañana a las 7 en punto', + # '2018-01-18 07:00:00', 'recuerdame que llame mama') + # TODO: si ponemos "mañana" de "por la mañana" como exclusión, funciona. + # pero no debe hacerse. + testExtract_es('recuérdame que llame a mamá a las 7 ' + 'en punto del jueves por la mañana', + '2018-01-18 07:00:00', 'recuerdame que llame mama mañana') + # testExtract_es('recuérdame que llame a mamá a las 7:00 ' + # 'del jueves por la mañana', + # '2018-01-18 07:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá a las 7:00 del jueves por la tarde', + '2018-01-18 19:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá a las 11:00 del ' + 'jueves por la noche', + '2018-01-18 23:00:00', 'recuerdame que llame mama') + # TODO: Lo mismo que el TODO anterior, si excluímos "madrugada", funciona + testExtract_es('recuérdame que llame a mamá a las 2:00 de la madrugada ' + 'del jueves', + '2018-01-18 02:00:00', 'recuerdame que llame mama madrugada') + testExtract_es('recuérdame que llame a mamá a las 2:00 de la tarde ' + 'del jueves', + '2018-01-18 14:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá el jueves a las 2:00 ' + 'de la tarde', + '2018-01-18 14:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá miércoles tarde a las 8', + '2018-01-17 20:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá en dos horas', + '2018-01-13 15:04:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá en quince minutos', + '2018-01-13 13:19:00', 'recuerdame que llame mama') + # testExtract_es('recuérdame que llame a mamá en media hora', + # '2018-01-13 13:34:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá en un cuarto de hora', + '2018-01-13 13:19:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá en tres cuartos de hora', + '2018-01-13 13:49:00', 'recuerdame que llame mama') + # testExtract_es('Pon música de Rick Astley 2 días a partir del viernes', + # '2018-01-21 00:00:00', 'pon musica rick astley') + testExtract_es('Empezar la invasión a las 3:45 pm del jueves', + '2018-01-18 15:45:00', 'empezar invasion') + testExtract_es('el lunes, pide pastel de hojaldre', + '2018-01-15 00:00:00', 'pide pastel hojaldre') + # testExtract_es('Pon la música de Happy Birthday 5 dentro de años', + # '2023-01-13 00:00:00', 'pon musica happy birthday') + testExtract_es('comprar fuegos artificiales el 4 de julio', + '2018-07-04 00:00:00', 'comprar fuegos artificiales') + # testExtract_es('¿Cuál es el tiempo 2 semanas después del próximo viernes?', + # '2018-02-02 00:00:00', 'cual tiempo') + testExtract_es('qué tiempo hará el miércoles a las 0700 ', + '2018-01-17 07:00:00', 'que tiempo hara') + # testExtract_es('Programa la visita en 2 semanas y 6 días a partir del sábado', + # '2018-02-02 00:00:00', 'programa visita') + testExtract_es('Empezar la invasión jueves a las 03 45', + '2018-01-18 03:45:00', 'empezar invasion') + testExtract_es('Empezar la invasión a las 800 del jueves', + '2018-01-18 08:00:00', 'empezar invasion') + # TODO: Fail, cambia "fiesta" por "fieste" + # testExtract_es('empezar la fiesta a las 8 en punto de la noche' + # ' del jueves', + # '2018-01-18 20:00:00', 'empezar fiesta') + testExtract_es('Empezar la invasión a las 8 de la noche del jueves', + '2018-01-18 20:00:00', 'empezar invasion') + testExtract_es('Empezar la invasión del jueves a mediodía', + '2018-01-18 12:00:00', 'empezar invasion') + testExtract_es('Empezar la invasión del jueves a medianoche', + '2018-01-19 00:00:00', 'empezar invasion') + testExtract_es('Empezar la invasión del jueves a las 0500', + '2018-01-18 05:00:00', 'empezar invasion') + testExtract_es('despiértame en 4 años', + '2022-01-13 00:00:00', 'despiertame') + testExtract_es('despiértame en 4 años y 4 días', + '2022-01-17 00:00:00', 'despiertame') + testExtract_es('cuál es la previsión del tiempo 3 días después de mañana?', + '2018-01-17 00:00:00', 'cual es prevision tiempo') + testExtract_es('el tres de diciembre', + '2018-12-03 00:00:00', '') + testExtract_es('el 3 de diciembre', + '2018-12-03 00:00:00', '') + testExtract_es('el 3 dic 2019', + '2019-12-03 00:00:00', '') + testExtract_es('en feb 3 2019', + '2019-02-03 00:00:00', '') + testExtract_es('encontrémonos a las 8:00 esta noche', + '2018-01-13 20:00:00', 'encontremonos') + testExtract_es('encontrémonos a las 5 pm', + '2018-01-13 17:00:00', 'encontremonos') + testExtract_es('encontrémonos a las 8 a.m.', + '2018-01-14 08:00:00', 'encontremonos') + testExtract_es('recuérdame que me despierte a las 8 am', + '2018-01-14 08:00:00', 'recuerdame que me despierte') + testExtract_es('qué tiempo hará el jueves', + '2018-01-18 00:00:00', 'que tiempo hara') + testExtract_es('qué tiempo hará para este lunes', + '2018-01-15 00:00:00', 'que tiempo hara') + testExtract_es('qué tiempo hará este miércoles', + '2018-01-17 00:00:00', 'que tiempo hara') + testExtract_es('para el jueves qué tiempo hará', + '2018-01-18 00:00:00', 'que tiempo hara') + testExtract_es('este jueves qué tiempo hará', + '2018-01-18 00:00:00', 'que tiempo hara') + # TODO: Fail el "pasado lunes" + testExtract_es('el anterior lunes qué tiempo hizo', + '2018-01-08 00:00:00', 'que tiempo hizo') + testExtract_es('pon un aviso para el miércoles tarde a las 8', + '2018-01-17 20:00:00', 'pon aviso') + testExtract_es('pon un aviso el miércoles a las 3 en punto' + ' de la tarde', + '2018-01-17 15:00:00', 'pon aviso') + # TODO: hay que excluir "mañana" para que funcione + testExtract_es('pon un aviso para este miércoles a las 3 en punto' + ' de la mañana', + '2018-01-17 03:00:00', 'pon aviso mañana') + testExtract_es('pon un despertador el miércoles por la mañana a las' + ' 7 en punto', + '2018-01-17 07:00:00', 'pon despertador mañana') + testExtract_es('pon un despertador para hoy a las 7 en punto', + '2018-01-13 19:00:00', 'pon despertador') + testExtract_es('pon un despertador para esta tarde a las 7 en punto', + '2018-01-13 19:00:00', 'pon despertador') + # TODO: Fail + # testExtract_es('pon un despertador esta tarde a las 07:00', + # '2018-01-13 19:00:00', 'pon despertador') + testExtract_es('en la noche del 5 de junio de 2017, recuérdame' + ' llamar a mi madre', + '2017-06-05 19:00:00', 'recuerdame llamar mi madre') + # TODO: Fail, "Julio" aquí es un nombre, si se cambia a "Carlos", también falla. + # testExtract_es('actualiza mi calendario para una reunión por la mañana' + # ' con Julio el 4 de Marzo', + # '2018-03-04 08:00:00', + # 'actualiza mi calendario reunión mañana julio') + testExtract_es('qué día es hoy', + '2018-01-13 00:00:00', 'que dia es') + # testExtract_es('qué día es mañana', + # '2018-01-14 00:00:00', 'que dia es') + testExtract_es('que dia fue ayer', + '2018-01-12 00:00:00', 'que dia fue') + # testExtract_es('que dia es pasado mañana', + # '2018-01-15 00:00:00', 'que dia es') + testExtract_es('quedemos para cenar en 5 días', + '2018-01-18 00:00:00', 'quedemos cenar') + # TODO: Fail + # testExtract_es('Qué tiempo tendremos pasado mañana', + # '2018-01-15 00:00:00', 'que tiempo tendremos') + testExtract_es('avísame a las 22:45', + '2018-01-13 22:45:00', 'avisame') + # TODO: Fail, ni excluyendo "mañana" parece funcionar + # testExtract_es('Qué tiempo hará el viernes por la mañana', + # '2018-01-19 08:00:00', 'que tiempo hara mañana') + # TODO: "próximo" funciona, "que viene" no + # testExtract_es('recuérdame que llame a mamá el jueves que viene', + # '2018-01-25 00:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá en 3 semanas', + '2018-02-03 00:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá en 8 semanas', + '2018-03-10 00:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá en 8 semanas' + ' y 2 días', + '2018-03-12 00:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá en 4 días', + '2018-01-17 00:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá en 3 meses', + '2018-04-13 00:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá en 2 años y 2 días', + '2020-01-15 00:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá la próxima semana', + '2018-01-20 00:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá la semana próxima', + '2018-01-20 00:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que llame a mamá la semana que viene', + '2018-01-20 00:00:00', 'recuerdame que llame mama') + testExtract_es('recuérdame que controle el gasto de la semana pasada', + '2018-01-06 00:00:00', 'recuerdame que controle gasto') + testExtract_es('recuérdame que controle el gasto de la pasada semana', + '2018-01-06 00:00:00', 'recuerdame que controle gasto') + testExtract_es('recuérdame que controle el gasto del mes pasado', + '2017-12-13 00:00:00', 'recuerdame que controle gasto') + testExtract_es('recuérdame que controle el gasto del pasado mes', + '2017-12-13 00:00:00', 'recuerdame que controle gasto') + testExtract_es('recuérdame que controle el gasto del mes anterior', + '2017-12-13 00:00:00', 'recuerdame que controle gasto') + testExtract_es('recuérdame que controle el gasto del anterior mes', + '2017-12-13 00:00:00', 'recuerdame que controle gasto') + testExtract_es('recuérdame que controle el gasto del mes próximo', + '2018-02-13 00:00:00', 'recuerdame que controle gasto') + testExtract_es('recuérdame que controle el gasto del próximo mes', + '2018-02-13 00:00:00', 'recuerdame que controle gasto') + # TODO: Fail + # testExtract_es('recuérdame que controle el gasto del siguiente mes', + # '2018-02-13 00:00:00', 'recuerdame que controle gasto') + testExtract_es('recuérdame que controle el gasto del mes que viene', + '2018-02-13 00:00:00', 'recuerdame que controle gasto') + testExtract_es('recuérdame que controle el gasto del año pasado', + '2017-01-13 00:00:00', 'recuerdame que controle gasto') + testExtract_es('recuérdame que controle el gasto del pasado año', + '2017-01-13 00:00:00', 'recuerdame que controle gasto') + testExtract_es('recuérdame que controle el gasto del año próximo', + '2019-01-13 00:00:00', 'recuerdame que controle gasto') + testExtract_es('recuérdame que controle el gasto del próximo año', + '2019-01-13 00:00:00', 'recuerdame que controle gasto') + testExtract_es('recuérdame que controle el gasto del año que viene', + '2019-01-13 00:00:00', 'recuerdame que controle gasto') + testExtract_es('recuérdame que llame el próximo jueves', + '2018-01-25 00:00:00', 'recuerdame que llame') + # TODO: Fail + # testExtract_es('recuérdame que llame jueves que viene', + # '2018-01-25 00:00:00', 'recuerdame que llame') + testExtract_es('recuérdame que controle el gasto del jueves pasado', + '2018-01-11 00:00:00', 'recuerdame que controle gasto') + # TODO: Fail + # testExtract_es('Jugar a futbol 2 días después del viernes', + # '2018-01-21 00:00:00', 'jugar futbol') + testExtract_es('Limpiar a las 15:45 del jueves', + '2018-01-18 15:45:00', 'limpiar') + testExtract_es('el lunes comprar queso', + '2018-01-15 00:00:00', 'comprar queso') + # TODO: a "cumpleaños" se le tiene que quitar la "s" final + testExtract_es('haz sonar música de cumpleaños en 5 años a partir de hoy', + '2023-01-13 00:00:00', 'haz sonar musica cumpleaño partir') + testExtract_es('haz sonar música de cumpleaños en 5 años a desde hoy', + '2023-01-13 00:00:00', 'haz sonar musica cumpleaño desde') + # TODO: "próximo" funciona, "que viene" no + testExtract_es('Hacer un Skype con mama a las 12:45 del jueves' + ' próximo.', + '2018-01-25 12:45:00', 'hacer skype con mama') + testExtract_es('¿Qué clima habrá este viernes?', + '2018-01-19 00:00:00', 'que clima habra') + testExtract_es('¿Qué clima habrá este viernes por la tarde?', + '2018-01-19 15:00:00', 'que clima habra') + testExtract_es('¿Qué clima habrá este viernes a media noche?', + '2018-01-20 00:00:00', 'que clima habra') + testExtract_es('¿Qué clima habrá este viernes a mediodía?', + '2018-01-19 12:00:00', 'que clima habra') + testExtract_es('Recuerdame llamar a mama el 3 agosto.', + '2018-08-03 00:00:00', 'recuerdame llamar mama') + testExtract_es('compra las velas el 1° de mayo', + '2018-05-01 00:00:00', 'compra velas') + testExtract_es('¿Qué clima habrá 1 día después de mañana?', + '2018-01-15 00:00:00', 'que clima habra') + testExtract_es('¿Qué clima habrá a la hora 7?', + '2018-01-13 19:00:00', 'que clima habra') + # TODO: Fail + # testExtract_es('¿Qué clima habrá mañana a las 7 en punto?', + # '2018-01-14 07:00:00', 'que clima habra') + # testExtract_es('¿Qué clima habrá mañana a las 2 de la tarde', + # '2018-01-14 14:00:00', 'que clima habra') + # testExtract_es('¿Qué clima habrá mañana por la tarde a las 2', + # '2018-01-14 14:00:00', 'que clima habra') + # testExtract_es('¿Qué clima habrá mañana sobre las 2:00', + # '2018-01-14 02:00:00', 'que clima habra') + # TODO: "próximo" funciona, "que viene" no + testExtract_es('¿Qué clima habrá a las 2 de la tarde del ' + 'viernes próximo?', + '2018-01-26 14:00:00', 'que clima habra') + testExtract_es('Recuérdame que me despierte en 4 años', + '2022-01-13 00:00:00', 'recuerdame que me despierte') + testExtract_es('Recuérdame que me despierte en 4 años y 4 días', + '2022-01-17 00:00:00', 'recuerdame que me despierte') + # TODO: Fail + # testExtract_es('Dormir 3 días desde mañana.', + # '2018-01-17 00:00:00', 'dormir') + # testExtract_es('marca la cita en 2 semanas y 6 días ' + # 'después del sábado', + # '2018-02-02 00:00:00', 'marca cita') + # TODO: Fail, cambia "fiesta" por "fieste"?? + # testExtract_es('La fiesta empieza a las 8 de la noche del jueves', + # '2018-01-18 20:00:00', 'fiesta empieza') + testExtract_es('Qué tiempo hará en 3 días?', + '2018-01-16 00:00:00', 'que tiempo hara') + testExtract_es('fija una cita diciembre 3', + '2018-12-03 00:00:00', 'fija cita') + # TODO: Fail + # testExtract_es('pon una cita el 3 de diciembre a las 3 de la tarde', + # '2018-12-03 15:00:00', 'fija cita') + testExtract_es('encontrémonos esta noche a las 8 ', + '2018-01-13 20:00:00', 'encontremonos') + testExtract_es('encontrémonos a las 8 esta noche', + '2018-01-13 20:00:00', 'encontremonos') + testExtract_es('pon una alarma esta noche a las 9', + '2018-01-13 21:00:00', 'pon alarma') + testExtract_es('pon una alarma esta noche a las 21', + '2018-01-13 21:00:00', 'pon alarma') + # TODO: Fail + # testExtract_es('insertar cita mañana por la noche a las 23', + # '2018-01-14 23:00:00', 'insertar cita') + # testExtract_es('insertar cita mañana a las 9 y media', + # '2018-01-14 09:30:00', 'insertar cita') + # testExtract_es('insertar cita mañana por la noche a las 23 y 3 cuartos', + # '2018-01-14 23:45:00', 'insertar cita') + + # TODO: Esto está bien, pero no entiendo por qué 5 cuartos está bien + testExtract_es('insertar cita esta noche a las 23 y 5 cuartos', + '2018-01-13 23:00:00', 'insertar cita') + + def test_extractdatetime_es_normalized(self): + """ + Test cases for Spanish datetime parsing + + """ + + def extractWithFormat_es(text): + date = datetime(2018, 1, 13, 13, 4) # Sab 13 Gen, 2018 @ 13:04 + [extractedDate, leftover] = extract_datetime(text, date, + lang='es') + extractedDate = extractedDate.strftime('%Y-%m-%d %H:%M:%S') + return [extractedDate, leftover] + + def testExtract_es(text, expected_date, expected_leftover): + res = extractWithFormat_es(normalize(text, lang='es')) + self.assertEqual(res[0], expected_date, 'por=' + text) + self.assertEqual(res[1], expected_leftover, 'por=' + text) + + testExtract_es('recuérdame que llame a mamá en 15 minutos', + '2018-01-13 13:19:00', 'recuerdame que llame a mama') + testExtract_es('llama a mamá a las 17 y 30', + '2018-01-13 17:30:00', 'llama a mama a') + # TODO: fail + # testExtract_es('recuérdame que llame a mamá el sábado a las 10 ' + + # 'de la mañana', + # '2018-01-13 10:00:00', 'recuerdame que llame a mama a mañana') + # testExtract_es('recuérdame que llame a mamá a las 10 de la mañana de' + # ' este sábado', + # '2018-01-13 10:00:00', 'recuerdame que llame a mama a mañana') + testExtract_es('recuérdame que llame a mamá a las 10 de la mañana del' + ' sábado que viene', + '2018-01-20 10:00:00', 'recuerdame que llame a mama a mañana') + testExtract_es('recuérdame que llame a mamá a las 10 de la mañana del' + ' próximo sábado', + '2018-01-20 10:00:00', 'recuerdame que llame a mama a mañana') + testExtract_es('¿Qué clima habrá este viernes a las 11 de la mañana?', + '2018-01-19 11:00:00', 'que clima habra a mañana') + testExtract_es('comprar fresas el 13 de mayo', + '2018-05-13 00:00:00', 'comprar fresas') + # testExtract_es('insertar cita mañana por la noche a las 23 y' + + # ' tres cuartos', + # '2018-01-14 23:45:00', 'insertar cita') + + def test_extract_ambiguous_time_es(self): + mañana = datetime(2017, 6, 27, 8, 1, 2) + noche = datetime(2017, 6, 27, 20, 1, 2) + mediodia = datetime(2017, 6, 27, 12, 1, 2) + self.assertEqual( + extract_datetime('alimentar a los peces a las 10 en punto', + mañana, lang='es')[0], + datetime(2017, 6, 27, 10, 0, 0)) + self.assertEqual( + extract_datetime('alimentar a los peces a las 10 en punto', + mediodia, lang='es')[0], + datetime(2017, 6, 27, 22, 0, 0)) + self.assertEqual( + extract_datetime('alimentar a los peces a las 10 en punto', + noche, lang='es')[0], + datetime(2017, 6, 27, 22, 0, 0)) + + def test_extract_relativedatetime_es(self): + """ + Test cases for relative datetime + """ + def extractWithFormat(text): + date = datetime(2017, 6, 27, 10, 1, 2) + [extractedDate, leftover] = extract_datetime(text, date, + lang='es') + extractedDate = extractedDate.strftime('%Y-%m-%d %H:%M:%S') + return [extractedDate, leftover] + + def testExtract_es(text, expected_date, expected_leftover): + res = extractWithFormat(normalize(text)) + self.assertEqual(res[0], expected_date, 'per =' + text) + self.assertEqual(res[1], expected_leftover, 'per =' + text) + + testExtract_es('encontrémonos en 5 minutos', + '2017-06-27 10:06:02', 'encontremonos') + testExtract_es('encontrémonos en 5 segundos', + '2017-06-27 10:01:07', 'encontremonos') + testExtract_es('encontrémonos en 1 hora', + '2017-06-27 11:01:02', 'encontremonos') + testExtract_es('encontrémonos en 2 horas', + '2017-06-27 12:01:02', 'encontremonos') + testExtract_es('encontrémonos en 1 minuto', + '2017-06-27 10:02:02', 'encontremonos') + testExtract_es('encontrémonos en 1 segundo', + '2017-06-27 10:01:03', 'encontremonos') + testExtract_es('encontrémonos en 25 horas', + '2017-06-28 11:01:02', 'encontremonos') + def test_spaces_es(self): + """ + Test cases for Spanish remove spaces + """ + self.assertEqual(normalize('esto es un test ', + lang='es'), 'esto es 1 test') + self.assertEqual(normalize(' otro test ', + lang='es'), 'otro test') + self.assertEqual(normalize('esto es otro test ', lang='es', + remove_articles=False), + 'esto es otro test') + self.assertEqual(normalize('esto es un test ', lang='es', + remove_articles=False), 'esto es 1 test') def test_numbers_es(self): - self.assertEqual(normalize("esto es un uno una", lang="es"), - "esto es 1 1 1") - self.assertEqual(normalize("esto es dos tres prueba", lang="es"), - "esto es 2 3 prueba") - self.assertEqual(normalize("esto es cuatro cinco seis prueba", - lang="es"), - "esto es 4 5 6 prueba") - self.assertEqual(normalize(u"siete m�s ocho m�s nueve", lang="es"), - u"7 m�s 8 m�s 9") - self.assertEqual(normalize("diez once doce trece catorce quince", - lang="es"), - "10 11 12 13 14 15") - self.assertEqual(normalize(u"dieciséis diecisiete", lang="es"), - "16 17") - self.assertEqual(normalize(u"dieciocho diecinueve", lang="es"), - "18 19") - self.assertEqual(normalize(u"veinte treinta cuarenta", lang="es"), - "20 30 40") - self.assertEqual(normalize(u"treinta y dos caballos", lang="es"), - "32 caballos") - self.assertEqual(normalize(u"cien caballos", lang="es"), - "100 caballos") - self.assertEqual(normalize(u"ciento once caballos", lang="es"), - "111 caballos") - self.assertEqual(normalize(u"hab�a cuatrocientas una vacas", - lang="es"), - u"hab�a 401 vacas") - self.assertEqual(normalize(u"dos mil", lang="es"), - "2000") - self.assertEqual(normalize(u"dos mil trescientas cuarenta y cinco", - lang="es"), - "2345") - self.assertEqual(normalize( - u"ciento veintitrés mil cuatrocientas cincuenta y seis", - lang="es"), - "123456") - self.assertEqual(normalize( - u"quinientas veinticinco mil", lang="es"), - "525000") - self.assertEqual(normalize( - u"novecientos noventa y nueve mil novecientos noventa y nueve", - lang="es"), - "999999") - - -if __name__ == "__main__": + """ + Test cases for Spanish normalize lang='es' + """ + self.assertEqual(normalize('es un test siete ocho nueve', + lang='es'), 'es 1 test 7 8 9') + self.assertEqual(normalize('test cero diez once doce trece', + lang='es'), 'test 0 10 11 12 13') + self.assertEqual(normalize('test mil seiscientos sesenta y seis', + lang='es', remove_articles=False), + 'test 1000 600 60 y 6') + self.assertEqual(normalize('test siete y medio', + lang='es', remove_articles=False), + 'test 7 y 0.5') + self.assertEqual(normalize('test dos punto nueve', + lang='es'), 'test 2 punto 9') + self.assertEqual(normalize('test ciento nueve', + lang='es', remove_articles=False), + 'test 100 9') + # TODO: Acepta "veinti" + self.assertEqual(normalize('test veinti y 1', + lang='es'), 'test 20 y 1') + self.assertEqual(normalize('test veintiuno y veintisiete', + lang='es'), 'test 21 y 27') + + def test_multiple_numbers_es(self): + self.assertEqual(extract_numbers('esto es la prueba uno dos tres', + lang='es'), [1.0, 2.0, 3.0]) + self.assertEqual(extract_numbers('esto es la prueba cuatro siete' + + ' cuatro', + lang='es'), [4.0, 7.0, 4.0]) + self.assertEqual(extract_numbers('esto es el test cinco seis siete', + lang='es'), [5.0, 6.0, 7.0]) + self.assertEqual(extract_numbers('esto es test diez once doce', + lang='es'), [10.0, 11.0, 12.0]) + self.assertEqual(extract_numbers('test doce gatos veintiuno', + lang='es'), [21.0, 12.0]) + self.assertEqual(extract_numbers('1 perro, siete cerdos, macdonald ' + + 'tenía la granja, 3 bodegas' + + ' 5 macarena', + lang='es'), [1, 7, 3, 5]) + self.assertEqual(extract_numbers('dos cervezas para dos osos', + lang='es'), [2.0, 2.0]) + self.assertEqual(extract_numbers('veinte cuarenta treinta', + lang='es'), [20, 40, 30]) + self.assertEqual(extract_numbers('veinte 20 22', + lang='es'), [20, 20, 22]) + self.assertEqual(extract_numbers('veintidós locos veinte ratas ' + 'veinte gatos', + lang='es'), [22, 20, 20]) + self.assertEqual(extract_numbers('veinte 20 veinte 2', + lang='es'), [20, 20, 20, 2]) + self.assertEqual(extract_numbers('un tercio uno', + lang='es'), [1 / 3, 1]) + # TODO: Fail + # self.assertEqual(extract_numbers('un tercio uno', + # lang='es', ordinals=True), [3]) + # self.assertEqual(extract_numbers('seis millardos', lang='es', + # short_scale=True), [6e9]) + # self.assertEqual(extract_numbers('seis millones', lang='es', + # short_scale=False), [6e6]) + # self.assertEqual(extract_numbers('doce cerdos acompañan a \ + # seis mil millones de bacterias', lang='es', short_scale=True), [6e9, 12]) + + # TODO case when pronounced/extracted number don't match + # fractional numbers often fail + # self.assertEqual(extract_numbers('esto es un siete ocho \ + # nueve y medio test',lang='es'), [7.0, 8.0, 9.5]) + # TODO pronounce number should accept short_scale flag + # self.assertEqual(extract_numbers('two pigs and six trillion + # bacteria', short_scale=False), [2, 6e18]) + # TODO pronounce_number should accept ordinals flag + # self.assertEqual(extract_numbers('thirty second or first', + # ordinals=True), [32, 1]) + + def test_extractdatetime_default_es(self): + default = time(9, 0, 0) + anchor = datetime(2017, 6, 27, 0, 0) + res = extract_datetime('¿Qué tiempo hará en 3 días?', + anchor, lang='es', default_time=default) + self.assertEqual(default, res[0].time()) + + def test_gender_es(self): + """ + Test cases for Spanish grammar , lang='es' + """ + self.assertEqual(get_gender('vaca', lang='es'), 'f') + self.assertEqual(get_gender('caballo', lang='es'), 'm') + self.assertEqual(get_gender('reses', 'las reses', lang='es'), 'f') + self.assertEqual(get_gender('buey', 'el buey come de la hierba', + lang='es'), 'm') + self.assertEqual(get_gender('peces', 'los peces nadan', + lang='es'), 'm') + self.assertEqual(get_gender('tigre', lang='es'), 'm') + self.assertEqual(get_gender('hombres', 'estos hombres comen pasta', + lang='es'), 'm') + self.assertEqual(get_gender('puente', 'el puente', lang='es'), 'm') + self.assertEqual(get_gender('puente', u'este puente ha caído', + lang='es'), 'm') + self.assertEqual(get_gender('escultora', 'esta escultora famosa', + lang='es'), 'f') + self.assertEqual(get_gender('escultor', 'este escultor famoso', + lang='es'), 'm') + self.assertEqual(get_gender('escultores', 'los escultores del Renacimiento', + lang='es'), 'm') + self.assertEqual(get_gender('escultoras', 'las escultoras modernas', + lang='es'), 'f') + self.assertEqual(get_gender('emperatriz', 'la emperatriz murió', + lang='es'), 'f') + self.assertEqual(get_gender('actriz', lang='es'), 'f') + self.assertEqual(get_gender('actor', lang='es'), 'm') + + +if __name__ == '__main__': unittest.main()