From 7977b1503141b12500b6ecb1b63f43aa25e7777e Mon Sep 17 00:00:00 2001
From: Diego Miguel
Date: Wed, 3 Apr 2024 15:16:44 +0200
Subject: [PATCH] Revamp project structure and refactor code

---
 negate/__init__.py                            |   5 +-
 negate/base.py                                |  63 +++
 negate/negate.py                              |  82 ++-
 negate/negator.py                             | 481 -----------------
 negate/negators/de/__init__.py                |   1 +
 .../{negator_de.py => negators/de/negator.py} | 355 +++++++++++-
 negate/negators/en/__init__.py                |   1 +
 .../{negator_en.py => negators/en/negator.py} | 506 +++++++++++++++---
 negate/negators/supported_languages.py        |  41 ++
 negate/utils/__init__.py                      |   0
 negate/{ => utils}/tokens.py                  |   0
 negate/version.py                             |   2 +-
 tests/conftest.py                             |   8 +-
 tests/test_negate.py                          |  60 ++-
 14 files changed, 992 insertions(+), 613 deletions(-)
 create mode 100644 negate/base.py
 delete mode 100644 negate/negator.py
 create mode 100644 negate/negators/de/__init__.py
 rename negate/{negator_de.py => negators/de/negator.py} (62%)
 create mode 100644 negate/negators/en/__init__.py
 rename negate/{negator_en.py => negators/en/negator.py} (60%)
 create mode 100644 negate/negators/supported_languages.py
 create mode 100644 negate/utils/__init__.py
 rename negate/{ => utils}/tokens.py (100%)

diff --git a/negate/__init__.py b/negate/__init__.py
index 1b8c75f..a3ecb16 100644
--- a/negate/__init__.py
+++ b/negate/__init__.py
@@ -1,6 +1,5 @@
 from .negate import Negator
-from .tokens import Token
+from .utils.tokens import Token
 
 # Don't expose the following submodules.
-#del globals()["negate"]
-#del globals()["tokens"]
+del globals()["negate"]
diff --git a/negate/base.py b/negate/base.py
new file mode 100644
index 0000000..95d6cd7
--- /dev/null
+++ b/negate/base.py
@@ -0,0 +1,63 @@
+"""Base negator."""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional
+
+
+class BaseNegator(ABC):
+    """Base negator.
+
+    Specific negators for different languages must inherit from this class.
+    """
+
+    @abstractmethod
+    def __init__(
+        self,
+        use_transformers: Optional[bool] = None,
+        use_gpu: Optional[bool] = None,
+        fail_on_unsupported: Optional[bool] = None,
+        log_level: Optional[int] = None,
+        **kwargs,
+    ):
+        """Instantiate a :obj:`Negator`.
+
+        Args:
+            use_transformers (:obj:`Optional[bool]`, defaults to :obj:`False`):
+                Whether to use a Transformer model for POS tagging and
+                dependency parsing.
+            use_gpu (:obj:`Optional[bool]`, defaults to :obj:`False`):
+                Whether to use the GPU, if available. This parameter is ignored
+                when :param:`use_transformers` is set to :obj:`False`.
+            fail_on_unsupported (:obj:`Optional[bool]`, defaults to :obj:`False`):
+                Whether to fail upon non-supported sentences. If set to
+                :obj:`False`, a warning will be printed, and the negator will
+                try to negate the sentence in a best-effort fashion.
+            log_level (:obj:`Optional[int]`, defaults to ``logging.INFO``):
+                The level of the logger.
+        """
+        pass
+
+    @abstractmethod
+    def negate_sentence(
+        self,
+        sentence: str,
+        **kwargs: Dict[str, Any],
+    ) -> List[str]:
+        """Negate a sentence.
+
+        Affirmative sentences will be turned into negative ones and vice versa.
+
+        Args:
+            sentence (:obj:`str`):
+                The sentence to negate.
+            **kwargs (:obj:`Dict[str, Any]`):
+                Additional parameters to pass to the concrete language negator.
+
+        Raises:
+            :obj:`RuntimeError`: If the sentence is not supported and
+                :arg:`fail_on_unsupported` is set to :obj:`True`.
+
+        Returns:
+            :obj:`List[str]`: The negated sentence(s).
+        """
+        pass
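For orientation, adding a language now means dropping a package under negate/negators/ whose negator.py exposes a Negator class built on this ABC. A minimal sketch of what such a module could look like (the "fr" package and its behavior are hypothetical, not part of this patch):

    # negate/negators/fr/negator.py -- hypothetical skeleton
    from typing import Any, Dict, List, Optional

    from negate.base import BaseNegator


    class Negator(BaseNegator):
        def __init__(self, use_transformers: Optional[bool] = None,
                     use_gpu: Optional[bool] = None,
                     fail_on_unsupported: Optional[bool] = None,
                     log_level: Optional[int] = None, **kwargs):
            # A real negator would load its NLP model here.
            self.fail_on_unsupported = bool(fail_on_unsupported)

        def negate_sentence(self, sentence: str,
                            **kwargs: Dict[str, Any]) -> List[str]:
            # A real negator parses the sentence and applies
            # language-specific rules; this stub echoes its input.
            return [sentence]
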
+ """ + pass diff --git a/negate/negate.py b/negate/negate.py index 384d748..b1040a0 100644 --- a/negate/negate.py +++ b/negate/negate.py @@ -1,30 +1,70 @@ -from negate.negator_en import Negator_EN -from negate.negator_de import Negator_DE -from typing import Optional +"""High-level negator.""" + +import importlib +from pathlib import Path +from typing import Dict, List, Optional + +from .negators.supported_languages import Language class Negator: + """High-level negator.""" def __init__( - self, language: str = "EN", - use_transformers: Optional[bool] = None, - use_gpu: Optional[bool] = None, - fail_on_unsupported: Optional[bool] = None, - log_level: Optional[int] = None + self, + language: str, + *, + use_transformers: Optional[bool] = None, + use_gpu: Optional[bool] = None, + fail_on_unsupported: Optional[bool] = None, + log_level: Optional[int] = None, + **kwargs: Dict, ): - self.language = language - if language == "EN": - self.negator = Negator_EN(use_transformers, use_gpu, fail_on_unsupported, log_level) - elif language == "DE": - self.negator = Negator_DE(use_transformers, use_gpu, fail_on_unsupported, log_level) - else: - raise ValueError("Language not supported, supported languages are EN and DE") + """Instanciate a :obj:`Negator`. - def negate_sentence( - self, - sentence: str, - *args, **kwargs - ) -> set[str] | str: - return self.negator.negate_sentence(sentence, *args, **kwargs) + Args: + use_transformers (:obj:`Optional[bool]`, defaults to :obj:`False`): + Whether to use a Transformer model for POS tagging and + dependency parsing. + .. note:: + + When set to :obj:`True` the model `en_core_web_trf + `__ is used. + use_gpu (:obj:`Optional[bool]`, defaults to :obj:`False`): + Whether to use the GPU, if available. This parameter is + ignored when :param:`use_transformers` is set to :obj:`False`. + fail_on_unsupported (:obj:`Optional[bool]`, defaults to :obj:`False`): + Whether to fail upon non-supported sentences. If set to + :obj:`False`, a warning will be printed, and the negator + will try to negate the sentence in a best-effort fashion. + log_level (:obj:`Optional[int]`, defaults to ``logging.INFO``): + The level of the logger. + kwargs (:obj:`Dict`): + Any other parameters to pass to the language-specific + negators. + + Raises: + :obj:`ValueError`: If the specified language is not supported. 
+ """ + if not Language.is_supported(language): + raise ValueError( + f'The language "{language}" is currently not supported.\n' + f"Valid values are {Language.get_all()}" + ) + self.language = language + self.negator = getattr( + importlib.import_module( + f".negators.{language}.negator", package=Path(__file__).parent.name + ), + "Negator", + )( + use_transformers=use_transformers, + use_gpu=use_gpu, + fail_on_unsupported=fail_on_unsupported, + log_level=log_level, + **kwargs, + ) + def negate_sentence(self, sentence: str, **kwargs) -> List[str]: + return self.negator.negate_sentence(sentence, **kwargs) diff --git a/negate/negator.py b/negate/negator.py deleted file mode 100644 index 647cb6e..0000000 --- a/negate/negator.py +++ /dev/null @@ -1,481 +0,0 @@ -from abc import ABC, abstractmethod - -from negate.tokens import Token -import spacy -from spacy.symbols import AUX, NOUN, PRON, VERB, neg -from spacy.tokens import Doc as SpacyDoc -from spacy.tokens import Token as SpacyToken -from typing import Dict, List, Optional -import logging - -class Negator_ABC(ABC): - - def __init__( - self, - use_transformers: Optional[bool] = None, - use_gpu: Optional[bool] = None, - fail_on_unsupported: Optional[bool] = None, - log_level: Optional[int] = None - ): - if use_transformers is None: - use_transformers = False - if use_gpu is None: - use_gpu = False - if fail_on_unsupported is None: - fail_on_unsupported = False - if log_level is None: - log_level = logging.INFO - - # Set up logger. - logging.basicConfig( - format="%(levelname)s: %(message)s", - level=log_level - ) - self.logger = logging.getLogger(__class__.__name__) - self.fail_on_unsupported = fail_on_unsupported - # Load spaCy model. If not available locally, the model will be first - # installed. - if use_transformers and use_gpu: - spacy.require_gpu() - else: - spacy.require_cpu() - self.spacy_model = self._initialize_spacy_model(use_transformers) - # Store whether tokens have a whitespace after them. This is used later - # on for de-tokenization. - SpacyToken.set_extension("has_space_after", default=True, force=True) - - @abstractmethod - def negate_sentence( - self, - sentence: str, - prefer_contractions: Optional[bool] = None - ) -> str: - pass - - @abstractmethod - def _initialize_spacy_model( - self, - use_transformers: bool, - **kwargs - ) -> spacy.language.Language: - pass - - def _parse(self, string_: str) -> SpacyDoc: - """Parse a string. - - This method cleans up the string and tokenizes it. The resulting - :obj:`SpacyDoc` object, also includes information about whitespaces to - facilitate de-tokenization later on. - - Args: - string_ (:obj:`str`): - The string to parse. - - Returns: - :obj:`SpacyDoc`: The string tokenized into a spaCy document. - """ - # Remove extra whitespaces and other non-printable chars. - string_ = self._remove_extra_whitespaces(string_) - # Tokenize. - doc = self.spacy_model(string_) - i = 0 # Used to determine whitespaces. - for tk in doc: - has_space_after: bool = ( - i + len(tk) < len(string_) and (string_[i + len(tk)] == " ") - ) - tk._.has_space_after = has_space_after - i += len(tk) + int(has_space_after) - return doc - - def _get_entry_point( - self, - doc: SpacyDoc, - contains_inversion: bool - ) -> Optional[SpacyToken]: - """Choose a suitable verb to attempt negating first, if any. - - Args: - doc (:obj:`SpacyDoc`): - The spaCy document in which to find the entry point. - contains_inversion (:obj:`bool`): - Whether the sentence contains an inversion or not. 
- - Returns: - :obj:`Optional[SpacyToken]`: The chosen entry point (verb), or - :obj:`None` if the sentence has no root, or contains no verbs. - """ - if contains_inversion: - entry_point = [tk for tk in doc - if self._is_aux(tk) or self._is_verb(tk)] - if entry_point: - return entry_point[0] - root = self._get_root(doc) - if root is None: # nothing we can do - return None - # If the root token is not an AUX or a VERB, look for an AUX or - # VERB in its children. - if not (self._is_aux(root) or self._is_verb(root)): - entry_point = None - if root.children: - entry_point = [tk for tk in root.children - if self._is_aux(tk) or self._is_verb(tk)] - # No AUX or VERB found in the root children -> Take the first - # AUX or VERB in the sentence, if any. - if not entry_point: - entry_point = [tk for tk in doc - if self._is_aux(tk) or self._is_verb(tk)] - return entry_point[0] if entry_point else None - return root - - @staticmethod - def _get_root(doc: SpacyDoc) -> Optional[SpacyToken]: - """Get the root token in a spaCy document, if any. - - Args: - doc (:obj:`SpacyDoc`): - The spaCy document to get the root from. - - Returns: - :obj:`Optional[SpacyToken]`: The root token, or :obj:`None` if the - sentence has no root. - """ - root = [tk for tk in doc if tk.dep_ == "ROOT"] - return root[0] if root else None - - @staticmethod - def _get_first_negation_particle( - doc: SpacyDoc - ) -> Optional[SpacyToken]: - """Get the first negation particle in a document. - - Args: - doc (:obj:`SpacyDoc`): - The spaCy document containing the token. - Returns: - :obj:`Optional[SpacyToken]`: The first negation particle in the - sentence, or :obj:`None` if no such particle exists. - """ - negation = [tk for tk in doc if tk.dep == neg] - return negation[0] if negation else None - - @staticmethod - def _get_negated_child( - token: SpacyToken, - min_index: int = 0 - ) -> Optional[SpacyToken]: - """Get the negated child of a token, if any. - - Only the first negated child with an index equal or greater than - :param:`min_index` is returned. - - Args: - token (:obj:`SpacyToken`): - The spaCy token to get the negated child from. - min_index (:obj:`int`, defaults to ``0``): - The minimum index (inclusive) the negated child must have in - order to be returned. Useful to consider children on the left - or the right of the passed token. - - Returns: - :obj:`Optional[SpacyToken]`: The negated child of :param:`token`, or - :obj:`None` if no negated child was found. - """ - if not token: - return None - min_index = max(0, min_index) # prevent negative values - child = [child for child in token.children if child.dep == neg - and child.i >= min_index] - return child[0] if child else None - - def _get_aux_child( - self, - token: SpacyToken, - min_index: int = 0 - ) -> Optional[SpacyToken]: - """Get the child of a token that is an auxiliary verb, if any. - - Only the first child that is an auxiliary with an index equal or greater - than :param:`min_index` is returned. - - Args: - token (:obj:`SpacyToken`): - The spaCy token to get the auxiliary children from. - min_index (:obj:`int`, defaults to ``0``): - The minimum index (inclusive) the auxiliary child must have in - order to be returned. Useful to consider children on the left - or the right of the passed token. - - Returns: - :obj:`Optional[SpacyToken]`: The auxiliary child of :param:`token`, - or :obj:`None` if no auxiliary child was found. 
- """ - if not token: - return None - min_index = max(0, min_index) # prevent negative values - child = [child for child in token.children if self._is_aux(child) - and child.i >= min_index] - return child[0] if child else None - - def _get_first_aux_or_verb( - self, - doc: SpacyDoc - ) -> Optional[SpacyToken]: - """Get the first verb in a spaCy document. - - The verb can be an auxiliary or not. - - Args: - doc (:obj:`SpacyDoc`): - The spaCy document to get the first verb from. - - Returns: - :obj:`Optional[SpacyToken]`: The first verb in the document or - :obj:`None` if no verb was found. - """ - aux = [tk for tk in doc if self._is_aux(tk) or self._is_verb(tk)] - return aux[0] if aux else None - - @staticmethod - def _get_parent( - token: SpacyToken, - doc: SpacyDoc - ) -> Optional[SpacyToken]: - """Get the parent of a given token, if any. - - Args: - token (:obj:`SpacyToken`): - The spaCy token to get the parent of. - doc (:obj:`SpacyDoc`): - The spaCy document in which to find for the parent. - - Returns: - :obj:`Optional[SpacyToken]`: The parent of the token, or :obj:`None` - if the token has no parent. - """ - if not token: - return None - parent = [ - potential_parent - for potential_parent in doc - if token in potential_parent.children - ] - return parent[0] if parent else None - - @staticmethod - def _is_aux(token: SpacyToken) -> bool: - """Determine whether a token is an auxiliary verb. - - Args: - token (:obj:`SpacyToken`): - The spaCy token to determine whether it is auxiliary. - - Returns: - :obj:`bool`: :obj:`True` if the token is an auxiliary verb, - otherwise :obj:`False`. - """ - if not token: - return False - return token.pos == AUX - - @staticmethod - def _is_pronoun(token: SpacyToken) -> bool: - """Determine whether a token is a pronoun. - - Args: - token (:obj:`SpacyToken`): - The spaCy token to determine whether it is a pronoun. - - Returns: - :obj:`bool`: :obj:`True` if the token is a pronoun, - otherwise :obj:`False`. - """ - if not token: - return False - return token.pos == PRON - - @staticmethod - def _is_noun(token: SpacyToken) -> bool: - """Determine whether a token is a noun. - - Args: - token (:obj:`SpacyToken`): - The spaCy token to determine whether it is a noun. - - Returns: - :obj:`bool`: :obj:`True` if the token is a noun, - otherwise :obj:`False`. - """ - if not token: - return False - return token.pos == NOUN - - @staticmethod - def _is_verb(token: SpacyToken) -> bool: - """Determine whether a token is a non-auxiliary verb. - - .. note:: - - If you want to check if a token is either an auxiliary *or* a verb, - you can use this method in combination with :meth:`Negator._is_aux`. - - Args: - token (:obj:`SpacyToken`): - The spaCy token to determine whether it is a non-auxiliary verb. - - Returns: - :obj:`bool`: :obj:`True` if the token is a non-auxiliary verb, - otherwise :obj:`False`. - """ - if not token: - return False - return token.pos == VERB - - @staticmethod - def _capitalize_first_letter(string_: str) -> str: - """Uppercase the first letter of a string. - - The capitalization of the rest of the string remains unchanged. - - Args: - string_ (:obj:`str`): - The string whose first letter to uppercase. - - Returns: - :obj:`str`: The string with its first letter uppercased. - """ - if not string_: - return "" - return f"{string_[0].upper()}{string_[1:]}" - - @staticmethod - def _remove_extra_whitespaces(string_: str) -> str: - """Remove any duplicated whitespaces in a string. 
- - Args: - string_ (:obj:`str`): - The string in which to remove any extra whitespaces. - - Returns: - :obj:`str`: The string with one whitespace at most between words. - """ - if not string_: - return "" - return " ".join(string_.split()) - - @staticmethod - def _find_number(token: SpacyToken) -> str: - """find the number type of token i.e. plural or singular""" - result = "" - morph_dict = token.morph.to_dict() - if "Number" in morph_dict: - result = morph_dict["Number"] - return result - - @staticmethod - def _find_case(token: SpacyToken) -> str: - """find the case type of token e.g. nominative, accusative, etc.""" - result = "" - morph_dict = token.morph.to_dict() - if "Case" in morph_dict: - result = morph_dict["Case"] - return result - - @staticmethod - def _find_gender(token: SpacyToken) -> str: - """find the gender type of token i.e. feminine, masculine, neutral.""" - result = "" - morph_dict = token.morph.to_dict() - if "Gender" in morph_dict: - result = morph_dict["Gender"] - return result - - @staticmethod - def _find_last_word(doc: SpacyDoc) -> SpacyToken: - """find the last word""" - if doc[-1].pos_ == "PUNCT": - return doc[-2] - return doc[-1] - - @staticmethod - def _is_full_sentence(root: SpacyToken) -> bool: - """Check if it is a full sentence""" - subject = [x for x in root.children if x.dep_ == "sb"] - if not subject: - return False - return subject[0].i < root.i - - @staticmethod - def _find_verb_form(token: SpacyToken) -> str: - """find the verb form of token""" - result = "" - morph_dict = token.morph.to_dict() - if "VerbForm" in morph_dict: - result = morph_dict["VerbForm"] - return result - - @staticmethod - def _find_definite(token: SpacyToken) -> str: - """find the definite type of token i.e. definite or indefinite""" - result = "" - morph_dict = token.morph.to_dict() - if "Definite" in morph_dict: - result = morph_dict["Definite"] - return result - - def _compile_sentence( - self, - doc: SpacyDoc, - remove_tokens: Optional[List[int]] = None, - add_tokens: Optional[Dict[int, Token]] = None - ) -> str: - """Process and de-tokenize a spaCy document back into a string. - - Args: - doc (:obj:`SpacyDoc`): - The spaCy document. - remove_tokens (:obj:`Optional[List[int]]`): - The indexes of the tokens to remove from the document, if any. - add_tokens (:obj:`Optional[Dict[int, Token]]`): - The tokens to add to the document, if any. These are specified - as a dictionary whose keys are the indexes in which to insert - the new tokens, which are the respective values. - - Returns: - :obj:`str`: The resulting, de-tokenized string including the - removal/addition of tokens, if any. - """ - if remove_tokens is None: - remove_tokens = [] - if add_tokens is None: - add_tokens = {} - else: - add_tokens = dict(sorted(add_tokens.items())) # sort by index - tokens = [Token(tk.text, tk._.has_space_after) for tk in doc] - for i in remove_tokens: - tokens[i] = Token(text="", has_space_after=False) - for count, item in enumerate(add_tokens.items()): - i, tk = item - tokens.insert(i + count, tk) - return self._capitalize_first_letter( - self._remove_extra_whitespaces( - "".join([f"{tk.text}{' ' * int(tk.has_space_after)}" - for tk in tokens]) - ) - ) - - def _handle_unsupported(self, fail: Optional[bool] = None): - """Handle behavior upon unsupported sentences. - - Args: - fail (:obj:`Optional[bool]`): - Whether to raise an exception with unsupported sentences or not. - Raises: - :obj:`RuntimeError`: If :arg:`fail_on_unsupported` is set to - :obj:`True`. 
- """ - if fail is None: - fail = self.fail_on_unsupported - if fail: - raise RuntimeError("sentence not supported") - else: - self.logger.warning("Sentence not supported. Output might be arbitrary.") diff --git a/negate/negators/de/__init__.py b/negate/negators/de/__init__.py new file mode 100644 index 0000000..4de22ab --- /dev/null +++ b/negate/negators/de/__init__.py @@ -0,0 +1 @@ +LANG_NAME: str = "German" diff --git a/negate/negator_de.py b/negate/negators/de/negator.py similarity index 62% rename from negate/negator_de.py rename to negate/negators/de/negator.py index ef8eb45..86fe576 100644 --- a/negate/negator_de.py +++ b/negate/negators/de/negator.py @@ -1,24 +1,34 @@ -from negate.negator import Negator_ABC -from typing import Dict, Optional, Union +"""German Negation.""" + import importlib +import logging import os import sys from contextlib import contextmanager +from typing import Any, Dict, List, Optional, Set, Union import spacy from DERBI.derbi import DERBI +from negate.base import BaseNegator +from negate.utils.tokens import Token from spacy.lang.de import German +from spacy.symbols import AUX, VERB from spacy.tokens import Doc as SpacyDoc from spacy.tokens import Token as SpacyToken -from negate.tokens import Token - -class Negator_DE(Negator_ABC): +class Negator(BaseNegator): + """Negator for the German language.""" - def __init__(self, use_transformers: Optional[bool] = None, use_gpu: Optional[bool] = None, - fail_on_unsupported: Optional[bool] = None, log_level: Optional[int] = None): - """Instanciate a :obj:`Negator`. + def __init__( + self, + use_transformers: Optional[bool] = None, + use_gpu: Optional[bool] = None, + fail_on_unsupported: Optional[bool] = None, + log_level: Optional[int] = None, + **kwargs, + ): + """Instanciate a German Negator. Args: use_transformers (:obj:`Optional[bool]`, defaults to :obj:`False`): @@ -43,7 +53,33 @@ def __init__(self, use_transformers: Optional[bool] = None, use_gpu: Optional[bo :obj:`RuntimeError`: If the sentence is not supported and :arg:`fail_on_unsupported` is set to :obj:`True`. """ - super().__init__(use_transformers, use_gpu, fail_on_unsupported, log_level) + if use_transformers is None: + use_transformers = False + if use_gpu is None: + use_gpu = False + if fail_on_unsupported is None: + fail_on_unsupported = False + if log_level is None: + log_level = logging.INFO + + # Set up logger. + logging.basicConfig( + format="%(levelname)s: %(message)s", + level=log_level + ) + self.logger = logging.getLogger(__class__.__name__) + self.fail_on_unsupported = fail_on_unsupported + # Load spaCy model. If not available locally, the model will be first + # installed. + if use_transformers and use_gpu: + spacy.require_gpu() + else: + spacy.require_cpu() + self.spacy_model = self._initialize_spacy_model(use_transformers) + # Store whether tokens have a whitespace after them. This is used later + # on for de-tokenization. + SpacyToken.set_extension("has_space_after", default=True, force=True) + self.derbi = DERBI(self.spacy_model) self._de_initialize_nicht_replace_table() self._de_initialize_kein_table() @@ -55,13 +91,37 @@ def __init__(self, use_transformers: Optional[bool] = None, use_gpu: Optional[bo self._de_initialize_reverse_two_part_phrase_table() def negate_sentence( - self, - sentence: str, - strategy: tuple = ("kein", "nicht", "phrase"), - ) -> set[str]: + self, + sentence: str, + **kwargs: Dict[str, Any], + ) -> List[str]: + """Negate a sentence. + + Affirmative sentences will be turned into negative ones and vice versa. 
+ + .. note:: + + Currently, only sentences that contain at least one verb are + supported. The output of non-supported sentences might be arbitrary. + + Args: + sentence (:obj:`str`): + The sentence to negate. + strategy (:obj:`Optional[List[str]]`, defaults to ``["kein", "nicht", "phrase"]``): + The negation strategy to use, i.e., whether to negate by adding + the "kein" or "nicht" negation particles, or perform a phrase + negation. + + Returns: + :obj:`List[str]`: The negated sentence(s). + """ results = set() if not sentence: - return set() + return [] + + strategy = kwargs.get("strategy") + if strategy is None: + strategy = ["kein", "nicht", "phrase"] for contraction in self._contraction_table: sentence = sentence.replace(contraction, self._contraction_table[contraction]) @@ -71,11 +131,11 @@ def negate_sentence( if not root: self._handle_unsupported() - return set() + return [] un_negated = self._un_negate_sentence(sentence, doc) if un_negated: - return un_negated + return list(un_negated) # Edge case "weder noch" exception_list = [" weder ", "Weder ", " sowohl ", "Sowohl"] @@ -97,9 +157,256 @@ def negate_sentence( final_result = final_result.replace(contraction, self._reverse_contraction_table[contraction]) final_results.add(final_result) - return final_results + return list(final_results) + + def _parse(self, string_: str) -> SpacyDoc: + """Parse a string. + + This method cleans up the string and tokenizes it. The resulting + :obj:`SpacyDoc` object, also includes information about whitespaces to + facilitate de-tokenization later on. + + Args: + string_ (:obj:`str`): + The string to parse. + + Returns: + :obj:`SpacyDoc`: The string tokenized into a spaCy document. + """ + # Remove extra whitespaces and other non-printable chars. + string_ = self._remove_extra_whitespaces(string_) + # Tokenize. + doc = self.spacy_model(string_) + i = 0 # Used to determine whitespaces. + for tk in doc: + has_space_after: bool = ( + i+len(tk) < len(string_) and (string_[i+len(tk)] == " ") + ) + tk._.has_space_after = has_space_after + i += len(tk) + int(has_space_after) + return doc + + def _get_entry_point( + self, + doc: SpacyDoc, + contains_inversion: bool + ) -> Optional[SpacyToken]: + """Choose a suitable verb to attempt negating first, if any. + + Args: + doc (:obj:`SpacyDoc`): + The spaCy document in which to find the entry point. + contains_inversion (:obj:`bool`): + Whether the sentence contains an inversion or not. + + Returns: + :obj:`Optional[SpacyToken]`: The chosen entry point (verb), or + :obj:`None` if the sentence has no root, or contains no verbs. + """ + if contains_inversion: + entry_point = [tk for tk in doc + if self._is_aux(tk) or self._is_verb(tk)] + if entry_point: + return entry_point[0] + root = self._get_root(doc) + if root is None: # nothing we can do + return None + # If the root token is not an AUX or a VERB, look for an AUX or + # VERB in its children. + if not (self._is_aux(root) or self._is_verb(root)): + entry_point = None + if root.children: + entry_point = [tk for tk in root.children + if self._is_aux(tk) or self._is_verb(tk)] + # No AUX or VERB found in the root children -> Take the first + # AUX or VERB in the sentence, if any. + if not entry_point: + entry_point = [tk for tk in doc + if self._is_aux(tk) or self._is_verb(tk)] + return entry_point[0] if entry_point else None + return root + + def _get_root(self, doc: SpacyDoc) -> Optional[SpacyToken]: + """Get the root token in a spaCy document, if any. 
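_parse and _compile_sentence form a round trip: parsing records whether each token carries a trailing space, and compilation replays that flag while applying removals and insertions. Reduced to plain (text, has_space_after) tuples, the bookkeeping works like this (a simplified sketch, no spaCy involved):

    def compile_tokens(tokens, remove=(), add=None):
        # tokens: list of (text, has_space_after) pairs.
        add = dict(sorted((add or {}).items()))
        tokens = list(tokens)
        for i in remove:
            tokens[i] = ("", False)
        for count, (i, tk) in enumerate(add.items()):
            tokens.insert(i + count, tk)
        text = "".join(f"{t}{' ' * int(s)}" for t, s in tokens)
        return " ".join(text.split())

    compile_tokens([("Er", True), ("kommt", True), ("heute", False)],
                   add={2: ("nicht", True)})
    # -> 'Er kommt nicht heute'
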
+ + Args: + doc (:obj:`SpacyDoc`): + The spaCy document to get the root from. + + Returns: + :obj:`Optional[SpacyToken]`: The root token, or :obj:`None` if the + sentence has no root. + """ + root = [tk for tk in doc if tk.dep_ == "ROOT"] + return root[0] if root else None + + def _capitalize_first_letter(self, string_: str) -> str: + """Uppercase the first letter of a string. - def _un_negate_sentence(self, sentence: str, doc: SpacyDoc) -> set[str]: + The capitalization of the rest of the string remains unchanged. + + Args: + string_ (:obj:`str`): + The string whose first letter to uppercase. + + Returns: + :obj:`str`: The string with its first letter uppercased. + """ + if not string_: + return "" + return f"{string_[0].upper()}{string_[1:]}" + + def _remove_extra_whitespaces(self, string_: str) -> str: + """Remove any duplicated whitespaces in a string. + + Args: + string_ (:obj:`str`): + The string in which to remove any extra whitespaces. + + Returns: + :obj:`str`: The string with one whitespace at most between words. + """ + if not string_: + return "" + return " ".join(string_.split()) + + def _is_aux(self, token: SpacyToken) -> bool: + """Determine whether a token is an auxiliary verb. + + Args: + token (:obj:`SpacyToken`): + The spaCy token to determine whether it is auxiliary. + + Returns: + :obj:`bool`: :obj:`True` if the token is an auxiliary verb, + otherwise :obj:`False`. + """ + if not token: + return False + return token.pos == AUX + + def _is_verb(self, token: SpacyToken) -> bool: + """Determine whether a token is a non-auxiliary verb. + + .. note:: + + If you want to check if a token is either an auxiliary *or* a verb, + you can use this method in combination with :meth:`Negator._is_aux`. + + Args: + token (:obj:`SpacyToken`): + The spaCy token to determine whether it is a non-auxiliary verb. + + Returns: + :obj:`bool`: :obj:`True` if the token is a non-auxiliary verb, + otherwise :obj:`False`. + """ + if not token: + return False + return token.pos == VERB + + @staticmethod + def _is_full_sentence(root: SpacyToken) -> bool: + """Check if it is a full sentence""" + subject = [x for x in root.children if x.dep_ == "sb"] + if not subject: + return False + return subject[0].i < root.i + + @staticmethod + def _find_verb_form(token: SpacyToken) -> str: + """find the verb form of token""" + result = "" + morph_dict = token.morph.to_dict() + if "VerbForm" in morph_dict: + result = morph_dict["VerbForm"] + return result + + @staticmethod + def _find_definite(token: SpacyToken) -> str: + """find the definite type of token i.e. definite or indefinite""" + result = "" + morph_dict = token.morph.to_dict() + if "Definite" in morph_dict: + result = morph_dict["Definite"] + return result + + @staticmethod + def _find_number(token: SpacyToken) -> str: + """find the number type of token i.e. plural or singular""" + result = "" + morph_dict = token.morph.to_dict() + if "Number" in morph_dict: + result = morph_dict["Number"] + return result + + @staticmethod + def _find_case(token: SpacyToken) -> str: + """find the case type of token e.g. nominative, accusative, etc.""" + result = "" + morph_dict = token.morph.to_dict() + if "Case" in morph_dict: + result = morph_dict["Case"] + return result + + @staticmethod + def _find_gender(token: SpacyToken) -> str: + """find the gender type of token i.e. 
feminine, masculine, neutral.""" + result = "" + morph_dict = token.morph.to_dict() + if "Gender" in morph_dict: + result = morph_dict["Gender"] + return result + + @staticmethod + def _find_last_word(doc: SpacyDoc) -> SpacyToken: + """find the last word""" + if doc[-1].pos_ == "PUNCT": + return doc[-2] + return doc[-1] + + def _compile_sentence( + self, + doc: SpacyDoc, + remove_tokens: Optional[List[int]] = None, + add_tokens: Optional[Dict[int, Token]] = None + ) -> str: + """Process and de-tokenize a spaCy document back into a string. + + Args: + doc (:obj:`SpacyDoc`): + The spaCy document. + remove_tokens (:obj:`Optional[List[int]]`): + The indexes of the tokens to remove from the document, if any. + add_tokens (:obj:`Optional[Dict[int, Token]]`): + The tokens to add to the document, if any. These are specified + as a dictionary whose keys are the indexes in which to insert + the new tokens, which are the respective values. + + Returns: + :obj:`str`: The resulting, de-tokenized string including the + removal/addition of tokens, if any. + """ + if remove_tokens is None: + remove_tokens = [] + if add_tokens is None: + add_tokens = {} + else: + add_tokens = dict(sorted(add_tokens.items())) # sort by index + tokens = [Token(tk.text, tk._.has_space_after) for tk in doc] + for i in remove_tokens: + tokens[i] = Token(text="", has_space_after=False) + for count, item in enumerate(add_tokens.items()): + i, tk = item + tokens.insert(i+count, tk) + return self._capitalize_first_letter( + self._remove_extra_whitespaces( + "".join([f"{tk.text}{' '*int(tk.has_space_after)}" + for tk in tokens]) + ) + ) + + def _un_negate_sentence(self, sentence: str, doc: SpacyDoc) -> Set[str]: # un negate phrases un_negated_sentences_phrases = self._reverse_negate_phrases(sentence) if un_negated_sentences_phrases: @@ -143,7 +450,7 @@ def _un_negate_sentence(self, sentence: str, doc: SpacyDoc) -> set[str]: def _negate_kein( self, doc: SpacyDoc - ) -> set[str]: + ) -> Set[str]: results = set() kein_noun_dicts = [self._generate_kein_dict(x) for x in doc if x.pos_ == "NOUN"] kein_noun_dicts = [x for x in kein_noun_dicts if x] @@ -162,7 +469,7 @@ def _negate_nicht( self, doc: SpacyDoc, root: Union[Token, SpacyToken] - ) -> set[str]: + ) -> Set[str]: results = set() svps = any(x.dep_ == "svp" for x in doc) adps = any(x.pos_ == "ADP" for x in doc) @@ -187,7 +494,7 @@ def _negate_nicht( return results - def _negate_phrases(self, sentence: str) -> set[str]: + def _negate_phrases(self, sentence: str) -> Set[str]: results = set() for phrase in self._phrase_table.keys(): @@ -205,7 +512,7 @@ def _negate_phrases(self, sentence: str) -> set[str]: return results - def _reverse_negate_phrases(self, sentence: str) -> set[str]: + def _reverse_negate_phrases(self, sentence: str) -> Set[str]: for phrase in self._reverse_phrase_table.keys(): if phrase in sentence: @@ -284,7 +591,7 @@ def _negate_adposition( root: Union[Token, SpacyToken], doc: SpacyDoc, dont_negate_at_end: bool, # noqa FBT001 - ) -> set[str]: + ) -> Set[str]: adpositions = [x for x in doc if x.pos_ == "ADP"] results = set() for adposition in adpositions: @@ -319,7 +626,7 @@ def _negate_verb_part( ) -> str: return self._negate_before_token(self._find_last_word(doc), doc) - def _generate_kein_dict(self, noun_token: Union[Token, SpacyToken]) -> dict | bool: + def _generate_kein_dict(self, noun_token: Union[Token, SpacyToken]) -> Union[bool, Dict]: children = list(noun_token.children) # No children to the right children = [child for child in children if child.i < 
noun_token.i] diff --git a/negate/negators/en/__init__.py b/negate/negators/en/__init__.py new file mode 100644 index 0000000..24b7316 --- /dev/null +++ b/negate/negators/en/__init__.py @@ -0,0 +1 @@ +LANG_NAME: str = "English" diff --git a/negate/negator_en.py b/negate/negators/en/negator.py similarity index 60% rename from negate/negator_en.py rename to negate/negators/en/negator.py index b65a268..38aec0a 100644 --- a/negate/negator_en.py +++ b/negate/negators/en/negator.py @@ -1,26 +1,34 @@ -"""Negation tools.""" +"""English Negation.""" + import importlib +import logging import os import sys from contextlib import contextmanager -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import spacy from lemminflect import getInflection, getLemma +from negate.base import BaseNegator +from negate.utils.tokens import Token +from negate.version import EN_CORE_WEB_MD_VERSION, EN_CORE_WEB_TRF_VERSION +from spacy.symbols import AUX, NOUN, PRON, VERB, neg from spacy.tokens import Doc as SpacyDoc from spacy.tokens import Token as SpacyToken -from negate.negator import Negator_ABC -from negate.tokens import Token -from negate.version import EN_CORE_WEB_MD_VERSION, EN_CORE_WEB_TRF_VERSION - -class Negator_EN(Negator_ABC): +class Negator(BaseNegator): """Negator for the English language.""" - def __init__(self, use_transformers: Optional[bool] = None, use_gpu: Optional[bool] = None, - fail_on_unsupported: Optional[bool] = None, log_level: Optional[int] = None): - """Instanciate a :obj:`Negator`. + def __init__( + self, + use_transformers: Optional[bool] = None, + use_gpu: Optional[bool] = None, + fail_on_unsupported: Optional[bool] = None, + log_level: Optional[int] = None, + **kwargs, + ): + """Instanciate an English Negator. Args: use_transformers (:obj:`Optional[bool]`, defaults to :obj:`False`): @@ -45,15 +53,40 @@ def __init__(self, use_transformers: Optional[bool] = None, use_gpu: Optional[bo :obj:`RuntimeError`: If the sentence is not supported and :arg:`fail_on_unsupported` is set to :obj:`True`. """ - super().__init__(use_transformers, use_gpu, fail_on_unsupported, log_level) + if use_transformers is None: + use_transformers = False + if use_gpu is None: + use_gpu = False + if fail_on_unsupported is None: + fail_on_unsupported = False + if log_level is None: + log_level = logging.INFO + + # Set up logger. + logging.basicConfig( + format="%(levelname)s: %(message)s", + level=log_level + ) + self.logger = logging.getLogger(__class__.__name__) + self.fail_on_unsupported = fail_on_unsupported + # Load spaCy model. If not available locally, the model will be first + # installed. + if use_transformers and use_gpu: + spacy.require_gpu() + else: + spacy.require_cpu() + self.spacy_model = self._initialize_spacy_model(use_transformers) # Initialize AUX negation dictionary. self._initialize_aux_negations() + # Store whether tokens have a whitespace after them. This is used later + # on for de-tokenization. + SpacyToken.set_extension("has_space_after", default=True, force=True) def negate_sentence( - self, - sentence: str, - prefer_contractions: Optional[bool] = None - ) -> str: + self, + sentence: str, + **kwargs: Dict[str, Any], + ) -> List[str]: """Negate a sentence. Affirmative sentences will be turned into negative ones and vice versa. @@ -72,10 +105,12 @@ def negate_sentence( ``"haven't"``, ``"wouldn't"``, etc.). Returns: - :obj:`str`: The negated sentence. + :obj:`List[str]`: The negated sentence. 
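In use, the English negator now returns a list and takes its options as keyword arguments (outputs illustrative):

    from negate import Negator

    negator = Negator("en")
    negator.negate_sentence("An apple a day keeps the doctor away.")
    # e.g. ["An apple a day doesn't keep the doctor away."]
    negator.negate_sentence("She is happy.", prefer_contractions=False)
    # e.g. ["She is not happy."]
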
""" if not sentence: - return "" + return [] + + prefer_contractions = kwargs.get("prefer_contractions") if prefer_contractions is None: prefer_contractions = True @@ -85,7 +120,7 @@ def negate_sentence( if not root or not self._is_sentence_supported(doc): self._handle_unsupported() if not root: # Don't even bother trying :) - return sentence + return [sentence] # Any negations we can remove? (e.g.: "I don't know.", "They won't # complain.", "He has not done it.", etc.). negation = self._get_first_negation_particle(doc) @@ -94,10 +129,10 @@ def negate_sentence( # complicates things. first_aux_or_verb = self._get_first_aux_or_verb(doc) while (negation and first_aux_or_verb - and first_aux_or_verb.tag_ not in ("VB", "VBG") - and negation.i < first_aux_or_verb.i): + and first_aux_or_verb.tag_ not in("VB", "VBG") + and negation.i < first_aux_or_verb.i): # Search for another negation, if any. - negation = self._get_negated_child(root, min_index=negation.i + 1) + negation = self._get_negated_child(root, min_index=negation.i+1) aux_child = self._get_aux_child(root) if negation: remove, add = self._handle_ca_wo(root, aux_child, negation=negation) @@ -120,11 +155,11 @@ def negate_sentence( remove = [root.i, negation.i] # Correctly handle space in e.g., "He hasn't been doing great." if negation.i < root.i and negation.i > 0: - doc[negation.i - 1]._.has_space_after = negation._.has_space_after + doc[negation.i-1]._.has_space_after = negation._.has_space_after # Correctly handle space in e.g., "I'm not doing great." vs. # "I am not doing great." space_before = " " * int(root.i > 0 - and doc[root.i - 1]._.has_space_after) + and doc[root.i-1]._.has_space_after) # Negation can come before ("She will not ever go.") or after # the root ("She will not."). Space after is different in each # case. @@ -144,11 +179,11 @@ def negate_sentence( f"{self.conjugate_verb(root.text.lower(), aux_child.tag_)}", has_space_after=root._.has_space_after ) - return self._compile_sentence( + return [self._compile_sentence( doc, remove_tokens=remove, add_tokens=add - ) + )] # AUX as ROOT (e.g.: "I'm excited.") or ROOT children e.g., # "I do think...". @@ -188,8 +223,7 @@ def negate_sentence( )} ) - @staticmethod - def conjugate_verb(verb: str, tag: str) -> str: + def conjugate_verb(self, verb: str, tag: str) -> str: """Conjugate a verb to a tense. Args: @@ -206,8 +240,7 @@ def conjugate_verb(verb: str, tag: str) -> str: conjugated_verb: Tuple[str] = getInflection(verb, tag) return conjugated_verb[0] if conjugated_verb else verb - @staticmethod - def get_base_verb(verb: str) -> str: + def get_base_verb(self, verb: str) -> str: """Get the base form (infinitive) of a verb. Args: @@ -222,14 +255,14 @@ def get_base_verb(verb: str) -> str: return base_verb[0] if base_verb else verb def negate_aux( - self, - auxiliary_verb: str, - prefer_contractions: Optional[bool] = None, - fail_on_unsupported: Optional[bool] = None + self, + auxiliary_verb: str, + prefer_contractions: Optional[bool] = None, + fail_on_unsupported: Optional[bool] = None ) -> Optional[str]: """Get the negated form of an auxiliary verb. - . note:: + .. note:: This method negates unidirectionally from affirmative to negative. 
In other words, :param:`auxiliary_verb` must be a non-negated @@ -264,16 +297,16 @@ def negate_aux( return negated_aux def _negate_aux_in_doc( - self, - aux: Union[Token, SpacyToken], - doc: SpacyDoc, - contains_inversion: bool, - prefer_contractions: Optional[bool] = None, - fail_on_unsupported: Optional[bool] = None + self, + aux: Union[Token, SpacyToken], + doc: SpacyDoc, + contains_inversion: bool, + prefer_contractions: Optional[bool] = None, + fail_on_unsupported: Optional[bool] = None ) -> str: """Negate an auxiliary within a sentence. - . note:: + .. note:: This method, differently from :meth:`Negator.negate_aux`, is bidirectional. That means that the passed auxiliary can be in its @@ -323,7 +356,7 @@ def _negate_aux_in_doc( if aux.text.lower() == "'s": parent = self._get_parent(aux, doc) if parent and (parent.tag_ == "VBN" - or any(child.tag_ == "VBN" for child in parent.children)): + or any(child.tag_ == "VBN" for child in parent.children)): # "'s" is "to have" aux_text = f"{aux.text}_" remove = [] @@ -331,16 +364,16 @@ def _negate_aux_in_doc( # version. if (contains_inversion and (aux.text.lower() == "am" or not prefer_contractions)): - # Find the closest pronoun to the right of the aux and add the negation + # Find closest pronoun to the right of the aux and add the negation # particle after it. pronoun = None - for tk in doc[aux.i + 1:]: + for tk in doc[aux.i+1:]: if self._is_pronoun(tk): pronoun = tk break if pronoun is None: self._handle_unsupported(fail=fail_on_unsupported) - add = {pronoun.i + 1: Token(text="not")} + add = {pronoun.i+1: Token(text="not")} else: # No inversion or contracted inversion. remove.append(aux.i) add = { @@ -354,11 +387,11 @@ def _negate_aux_in_doc( ) } # Handle e.g., "should've" -> "shouldn't have" - if aux.i + 1 < len(doc) and doc[aux.i + 1].text.lower() == "'ve": - remove.append(aux.i + 1) - add[aux.i + 1] = Token( + if aux.i+1 < len(doc) and doc[aux.i+1].text.lower() == "'ve": + remove.append(aux.i+1) + add[aux.i+1] = Token( text=" have", - has_space_after=doc[aux.i + 1]._.has_space_after + has_space_after=doc[aux.i+1]._.has_space_after ) return self._compile_sentence( doc, @@ -366,10 +399,10 @@ def _negate_aux_in_doc( add_tokens=add ) - @staticmethod def _handle_ca_wo( - *aux_tokens: Optional[SpacyToken], - negation: SpacyToken + self, + *aux_tokens: Optional[SpacyToken], + negation: SpacyToken ) -> Tuple[Optional[List[int]], Optional[Dict[int, Token]]]: """Handle special cases ``"won't"`` and ``"can't"``. @@ -431,8 +464,272 @@ def _handle_ca_wo( return remove, add return None, None - @staticmethod - def _is_verb_to_do(verb: SpacyToken) -> bool: + def _parse(self, string_: str) -> SpacyDoc: + """Parse a string. + + This method cleans up the string and tokenizes it. The resulting + :obj:`SpacyDoc` object, also includes information about whitespaces to + facilitate de-tokenization later on. + + Args: + string_ (:obj:`str`): + The string to parse. + + Returns: + :obj:`SpacyDoc`: The string tokenized into a spaCy document. + """ + # Remove extra whitespaces and other non-printable chars. + string_ = self._remove_extra_whitespaces(string_) + # Tokenize. + doc = self.spacy_model(string_) + i = 0 # Used to determine whitespaces. 
+ for tk in doc: + has_space_after: bool = ( + i+len(tk) < len(string_) and (string_[i+len(tk)] == " ") + ) + tk._.has_space_after = has_space_after + i += len(tk) + int(has_space_after) + return doc + + def _get_entry_point( + self, + doc: SpacyDoc, + contains_inversion: bool + ) -> Optional[SpacyToken]: + """Choose a suitable verb to attempt negating first, if any. + + Args: + doc (:obj:`SpacyDoc`): + The spaCy document in which to find the entry point. + contains_inversion (:obj:`bool`): + Whether the sentence contains an inversion or not. + + Returns: + :obj:`Optional[SpacyToken]`: The chosen entry point (verb), or + :obj:`None` if the sentence has no root, or contains no verbs. + """ + if contains_inversion: + entry_point = [tk for tk in doc + if self._is_aux(tk) or self._is_verb(tk)] + if entry_point: + return entry_point[0] + root = self._get_root(doc) + if root is None: # nothing we can do + return None + # If the root token is not an AUX or a VERB, look for an AUX or + # VERB in its children. + if not (self._is_aux(root) or self._is_verb(root)): + entry_point = None + if root.children: + entry_point = [tk for tk in root.children + if self._is_aux(tk) or self._is_verb(tk)] + # No AUX or VERB found in the root children -> Take the first + # AUX or VERB in the sentence, if any. + if not entry_point: + entry_point = [tk for tk in doc + if self._is_aux(tk) or self._is_verb(tk)] + return entry_point[0] if entry_point else None + return root + + def _get_root(self, doc: SpacyDoc) -> Optional[SpacyToken]: + """Get the root token in a spaCy document, if any. + + Args: + doc (:obj:`SpacyDoc`): + The spaCy document to get the root from. + + Returns: + :obj:`Optional[SpacyToken]`: The root token, or :obj:`None` if the + sentence has no root. + """ + root = [tk for tk in doc if tk.dep_ == "ROOT"] + return root[0] if root else None + + def _get_first_negation_particle( + self, + doc: SpacyDoc + ) -> Optional[SpacyToken]: + """Get the first negation particle in a document. + + Args: + doc (:obj:`SpacyDoc`): + The spaCy document containing the token. + Returns: + :obj:`Optional[SpacyToken]`: The first negation particle in the + sentence, or :obj:`None` if no such particle exists. + """ + negation = [tk for tk in doc if tk.dep == neg] + return negation[0] if negation else None + + def _get_negated_child( + self, + token: SpacyToken, + min_index: int = 0 + ) -> Optional[SpacyToken]: + """Get the negated child of a token, if any. + + Only the first negated child with an index equal or greater than + :param:`min_index` is returned. + + Args: + token (:obj:`SpacyToken`): + The spaCy token to get the negated child from. + min_index (:obj:`int`, defaults to ``0``): + The minimum index (inclusive) the negated child must have in + order to be returned. Useful to consider children on the left + or the right of the passed token. + + Returns: + :obj:`Optional[SpacyToken]`: The negated child of :param:`token`, or + :obj:`None` if no negated child was found. + """ + if not token: + return None + min_index = max(0, min_index) # prevent negative values + child = [child for child in token.children if child.dep == neg + and child.i >= min_index] + return child[0] if child else None + + def _get_aux_child( + self, + token: SpacyToken, + min_index: int = 0 + ) -> Optional[SpacyToken]: + """Get the child of a token that is an auxiliary verb, if any. + + Only the first child that is an auxiliary with an index equal or greater + than :param:`min_index` is returned. 
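These lookup helpers are thin wrappers over spaCy's dependency attributes; on a concrete parse, the relationships they inspect look like this (using the en_core_web_md model the patch pins; exact parses can vary across model versions):

    import spacy

    nlp = spacy.load("en_core_web_md")
    doc = nlp("She has not gone home.")
    root = next(tk for tk in doc if tk.dep_ == "ROOT")       # "gone"
    aux = [tk for tk in root.children if tk.pos_ == "AUX"]   # ["has"]
    neg = [tk for tk in root.children if tk.dep_ == "neg"]   # ["not"]
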
+ + Args: + token (:obj:`SpacyToken`): + The spaCy token to get the auxiliary children from. + min_index (:obj:`int`, defaults to ``0``): + The minimum index (inclusive) the auxiliary child must have in + order to be returned. Useful to consider children on the left + or the right of the passed token. + + Returns: + :obj:`Optional[SpacyToken]`: The auxiliary child of :param:`token`, + or :obj:`None` if no auxiliary child was found. + """ + if not token: + return None + min_index = max(0, min_index) # prevent negative values + child = [child for child in token.children if self._is_aux(child) + and child.i >= min_index] + return child[0] if child else None + + def _get_first_aux_or_verb( + self, + doc: SpacyDoc + ) -> Optional[SpacyToken]: + """Get the first verb in a spaCy document. + + The verb can be an auxiliary or not. + + Args: + doc (:obj:`SpacyDoc`): + The spaCy document to get the first verb from. + + Returns: + :obj:`Optional[SpacyToken]`: The first verb in the document or + :obj:`None` if no verb was found. + """ + aux = [tk for tk in doc if self._is_aux(tk) or self._is_verb(tk)] + return aux[0] if aux else None + + def _get_parent( + self, + token: SpacyToken, + doc: SpacyDoc + ) -> Optional[SpacyToken]: + """Get the parent of a given token, if any. + + Args: + token (:obj:`SpacyToken`): + The spaCy token to get the parent of. + doc (:obj:`SpacyDoc`): + The spaCy document in which to find for the parent. + + Returns: + :obj:`Optional[SpacyToken]`: The parent of the token, or :obj:`None` + if the token has no parent. + """ + if not token: + return None + parent = [ + potential_parent + for potential_parent in doc + if token in potential_parent.children + ] + return parent[0] if parent else None + + def _is_aux(self, token: SpacyToken) -> bool: + """Determine whether a token is an auxiliary verb. + + Args: + token (:obj:`SpacyToken`): + The spaCy token to determine whether it is auxiliary. + + Returns: + :obj:`bool`: :obj:`True` if the token is an auxiliary verb, + otherwise :obj:`False`. + """ + if not token: + return False + return token.pos == AUX + + def _is_pronoun(self, token: SpacyToken) -> bool: + """Determine whether a token is a pronoun. + + Args: + token (:obj:`SpacyToken`): + The spaCy token to determine whether it is a pronoun. + + Returns: + :obj:`bool`: :obj:`True` if the token is a pronoun, + otherwise :obj:`False`. + """ + if not token: + return False + return token.pos == PRON + + def _is_noun(self, token: SpacyToken) -> bool: + """Determine whether a token is a noun. + + Args: + token (:obj:`SpacyToken`): + The spaCy token to determine whether it is a noun. + + Returns: + :obj:`bool`: :obj:`True` if the token is a noun, + otherwise :obj:`False`. + """ + if not token: + return False + return token.pos == NOUN + + def _is_verb(self, token: SpacyToken) -> bool: + """Determine whether a token is a non-auxiliary verb. + + .. note:: + + If you want to check if a token is either an auxiliary *or* a verb, + you can use this method in combination with :meth:`Negator._is_aux`. + + Args: + token (:obj:`SpacyToken`): + The spaCy token to determine whether it is a non-auxiliary verb. + + Returns: + :obj:`bool`: :obj:`True` if the token is a non-auxiliary verb, + otherwise :obj:`False`. + """ + if not token: + return False + return token.pos == VERB + + def _is_verb_to_do(self, verb: SpacyToken) -> bool: """Determine whether a verb is the verb "to do" (in any tense). 
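This check, like conjugate_verb and get_base_verb earlier in the file, delegates to lemminflect, whose two entry points behave like so:

    from lemminflect import getInflection, getLemma

    getLemma("done", upos="VERB")    # ('do',)
    getInflection("be", tag="VBD")   # ('was', 'were')
    getInflection("go", tag="VBZ")   # ('goes',)
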
Args: @@ -447,8 +744,7 @@ def _is_verb_to_do(verb: SpacyToken) -> bool: return False return getLemma(verb.text.lower(), "VERB")[0] == "do" - @staticmethod - def _is_verb_to_be(verb: SpacyToken) -> bool: + def _is_verb_to_be(self, verb: SpacyToken) -> bool: """Determine whether a verb is the verb "to be" (in any tense). Args: @@ -505,7 +801,7 @@ def _contains_inversion(self, doc: SpacyDoc) -> bool: # Only attend to pronouns that don't refer to a noun (i.e., those # which could act as subjects). if (self._is_pronoun(tk) - and not self._is_noun(self._get_parent(tk, doc))): + and not self._is_noun(self._get_parent(tk, doc))): pronoun = tk if aux and pronoun: break @@ -513,10 +809,81 @@ def _contains_inversion(self, doc: SpacyDoc) -> bool: return False return aux.i < pronoun.i + def _compile_sentence( + self, + doc: SpacyDoc, + remove_tokens: Optional[List[int]] = None, + add_tokens: Optional[Dict[int, Token]] = None + ) -> str: + """Process and de-tokenize a spaCy document back into a string. + + Args: + doc (:obj:`SpacyDoc`): + The spaCy document. + remove_tokens (:obj:`Optional[List[int]]`): + The indexes of the tokens to remove from the document, if any. + add_tokens (:obj:`Optional[Dict[int, Token]]`): + The tokens to add to the document, if any. These are specified + as a dictionary whose keys are the indexes in which to insert + the new tokens, which are the respective values. + + Returns: + :obj:`str`: The resulting, de-tokenized string including the + removal/addition of tokens, if any. + """ + if remove_tokens is None: + remove_tokens = [] + if add_tokens is None: + add_tokens = {} + else: + add_tokens = dict(sorted(add_tokens.items())) # sort by index + tokens = [Token(tk.text, tk._.has_space_after) for tk in doc] + for i in remove_tokens: + tokens[i] = Token(text="", has_space_after=False) + for count, item in enumerate(add_tokens.items()): + i, tk = item + tokens.insert(i+count, tk) + return self._capitalize_first_letter( + self._remove_extra_whitespaces( + "".join([f"{tk.text}{' '*int(tk.has_space_after)}" + for tk in tokens]) + ) + ) + + def _capitalize_first_letter(self, string_: str) -> str: + """Uppercase the first letter of a string. + + The capitalization of the rest of the string remains unchanged. + + Args: + string_ (:obj:`str`): + The string whose first letter to uppercase. + + Returns: + :obj:`str`: The string with its first letter uppercased. + """ + if not string_: + return "" + return f"{string_[0].upper()}{string_[1:]}" + + def _remove_extra_whitespaces(self, string_: str) -> str: + """Remove any duplicated whitespaces in a string. + + Args: + string_ (:obj:`str`): + The string in which to remove any extra whitespaces. + + Returns: + :obj:`str`: The string with one whitespace at most between words. + """ + if not string_: + return "" + return " ".join(string_.split()) + def _initialize_spacy_model( - self, - use_transformers: bool, - **kwargs + self, + use_transformers: bool, + **kwargs ) -> spacy.language.Language: """Initialize the spaCy model to be used by the Negator. @@ -545,7 +912,6 @@ def _initialize_spacy_model( :obj:`spacy.language.Language`: The loaded spaCy model, ready to use. """ - # See https://stackoverflow.com/a/25061573/14683209 # We don't want the messages coming from pip "polluting" stdout. @contextmanager @@ -582,6 +948,24 @@ def suppress_stdout(): model_module = importlib.import_module(module_name) return model_module.load(**kwargs) + def _handle_unsupported(self, fail: Optional[bool] = None): + """Handle behavior upon unsupported sentences. 
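Returning to _initialize_spacy_model above: its install-on-first-use logic boils down to a try/except import around spaCy's own downloader. A simplified sketch of the same idea (the patch additionally pins model versions and silences pip's output, which this omits):

    import importlib

    from spacy.cli import download

    def load_spacy_model(name: str = "en_core_web_md"):
        try:
            return importlib.import_module(name).load()
        except ModuleNotFoundError:
            download(name)  # pip-installs the model package
            importlib.invalidate_caches()  # make the new package importable
            return importlib.import_module(name).load()
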
+
+        Args:
+            fail (:obj:`Optional[bool]`):
+                Whether to raise an exception upon unsupported sentences.
+
+        Raises:
+            :obj:`RuntimeError`: If :arg:`fail_on_unsupported` is set to
+                :obj:`True`.
+        """
+        if fail is None:
+            fail = self.fail_on_unsupported
+        if fail:
+            raise RuntimeError("Sentence not supported.")
+        else:
+            self.logger.warning("Sentence not supported. Output might be "
+                                "arbitrary.")
+
     def _initialize_aux_negations(self) -> None:
         """Define the auxiliary verbs and their negated form.
diff --git a/negate/negators/supported_languages.py b/negate/negators/supported_languages.py
new file mode 100644
index 0000000..53f41f3
--- /dev/null
+++ b/negate/negators/supported_languages.py
@@ -0,0 +1,41 @@
+"""Languages supported by Negate."""
+
+from pathlib import Path
+from typing import List
+
+
+class Language:
+    """Currently available languages for negation.
+
+    The values correspond to the `ISO 639-1 language codes
+    <https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes>`__.
+    """
+
+    _languages: List[str] = [
+        f.name for f in Path(__file__).parent.iterdir() if f.is_dir()
+        if not f.name.startswith("__")
+    ]
+
+    @classmethod
+    def is_supported(cls, lang: str) -> bool:
+        """Determine whether a string represents a supported language.
+
+        Args:
+            lang (:obj:`str`):
+                The language to check support for.
+
+        Returns:
+            :obj:`bool`: Whether the string represents a supported language
+                or not.
+        """
+        return lang in cls._languages
+
+    @classmethod
+    def get_all(cls) -> List[str]:
+        """Get all the languages currently supported for negation.
+
+        Returns:
+            :obj:`List[str]`: All the currently supported languages.
+        """
+        return cls._languages
diff --git a/negate/utils/__init__.py b/negate/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/negate/tokens.py b/negate/utils/tokens.py
similarity index 100%
rename from negate/tokens.py
rename to negate/utils/tokens.py
diff --git a/negate/version.py b/negate/version.py
index 2db60e1..933da12 100644
--- a/negate/version.py
+++ b/negate/version.py
@@ -3,6 +3,6 @@
 # Negate version.
 __version__ = "1.1.3"
 
-# spaCy models version.
+# spaCy models versions.
 EN_CORE_WEB_MD_VERSION: str = "3.7.0"
 EN_CORE_WEB_TRF_VERSION: str = "3.7.2"
diff --git a/tests/conftest.py b/tests/conftest.py
index 507b9d3..f484926 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -24,11 +24,15 @@ def pytest_generate_tests(metafunc):
     if not metafunc.config.getoption("use_cpu"):
         with suppress(ValueError, NotImplementedError):
             # `use_gpu` ignored if `use_transformers` is False.
-            negator_model = Negator(use_transformers=use_transformers,
+            negator_model = Negator(language="en",
+                                    use_transformers=use_transformers,
+                                    use_gpu=True)
             # If GPU is unsupported, we fallback to CPU.
negator_model.negate_sentence("I will now check GPU support!") else: negator_model = Negator( - use_transformers=use_transformers, use_gpu=False) + language="en", + use_transformers=use_transformers, + use_gpu=False + ) metafunc.parametrize("negator", [negator_model]) diff --git a/tests/test_negate.py b/tests/test_negate.py index 8d827a6..47a46e5 100644 --- a/tests/test_negate.py +++ b/tests/test_negate.py @@ -19,8 +19,10 @@ def test_aux_root_affirmative( output_sentence: str, prefer_contractions: bool ): - assert negator.negate_sentence( - input_sentence, prefer_contractions) == output_sentence + assert output_sentence in negator.negate_sentence( + input_sentence, + prefer_contractions=prefer_contractions + ) @pytest.mark.parametrize( @@ -33,8 +35,10 @@ def test_aux_root_negative( output_sentence: str, prefer_contractions: bool ): - assert negator.negate_sentence( - input_sentence, prefer_contractions) == output_sentence + assert output_sentence in negator.negate_sentence( + input_sentence, + prefer_contractions=prefer_contractions + ) @pytest.mark.parametrize( @@ -47,8 +51,10 @@ def test_aux_root_children_affirmative( output_sentence: str, prefer_contractions: bool ): - assert negator.negate_sentence( - input_sentence, prefer_contractions) == output_sentence + assert output_sentence in negator.negate_sentence( + input_sentence, + prefer_contractions=prefer_contractions + ) @pytest.mark.parametrize( @@ -61,8 +67,10 @@ def test_aux_root_children_negative( output_sentence: str, prefer_contractions: bool ): - assert negator.negate_sentence( - input_sentence, prefer_contractions) == output_sentence + assert output_sentence in negator.negate_sentence( + input_sentence, + prefer_contractions=prefer_contractions + ) @pytest.mark.parametrize( @@ -75,8 +83,10 @@ def test_general_verbs_affirmative( output_sentence: str, prefer_contractions: bool ): - assert negator.negate_sentence( - input_sentence, prefer_contractions) == output_sentence + assert output_sentence in negator.negate_sentence( + input_sentence, + prefer_contractions=prefer_contractions + ) @pytest.mark.parametrize( @@ -89,8 +99,10 @@ def test_general_verbs_negative( output_sentence: str, prefer_contractions: bool ): - assert negator.negate_sentence( - input_sentence, prefer_contractions) == output_sentence + assert output_sentence in negator.negate_sentence( + input_sentence, + prefer_contractions=prefer_contractions + ) @pytest.mark.parametrize( @@ -103,8 +115,10 @@ def test_inversions_affirmative( output_sentence: str, prefer_contractions: bool ): - assert negator.negate_sentence( - input_sentence, prefer_contractions) == output_sentence + assert output_sentence in negator.negate_sentence( + input_sentence, + prefer_contractions=prefer_contractions + ) @pytest.mark.parametrize( @@ -117,8 +131,10 @@ def test_inversions_negative( output_sentence: str, prefer_contractions: bool ): - assert negator.negate_sentence( - input_sentence, prefer_contractions) == output_sentence + assert output_sentence in negator.negate_sentence( + input_sentence, + prefer_contractions=prefer_contractions + ) @pytest.mark.parametrize( @@ -131,8 +147,10 @@ def test_misc( output_sentence: str, prefer_contractions: bool ): - assert negator.negate_sentence( - input_sentence, prefer_contractions) == output_sentence + assert output_sentence in negator.negate_sentence( + input_sentence, + prefer_contractions=prefer_contractions + ) @pytest.mark.xfail @pytest.mark.parametrize( @@ -145,5 +163,7 @@ def test_failing( output_sentence: str, prefer_contractions: bool ): - 
assert negator.negate_sentence( - input_sentence, prefer_contractions) == output_sentence + assert output_sentence in negator.negate_sentence( + input_sentence, + prefer_contractions=prefer_contractions + )
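The assertion rewrite above tracks the API change: negate_sentence now returns a list of candidate negations rather than a single string, so the tests check membership instead of equality. The same pattern applies interactively (output illustrative):

    from negate import Negator

    negator = Negator(language="en")
    results = negator.negate_sentence("I have faith in him.",
                                      prefer_contractions=True)
    assert "I don't have faith in him." in results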