First version of MWE matching with no special syntax #24

apmoore1 committed Jan 27, 2022
1 parent d48656a commit bf83c87
Showing 4 changed files with 358 additions and 20 deletions.
223 changes: 222 additions & 1 deletion pymusas/taggers/rule_based.py
@@ -1,4 +1,225 @@
from typing import Dict, Iterable, Iterator, List, Optional, Tuple
import re
from typing import Dict, Iterable, Iterator, List, Optional, OrderedDict, Tuple


def _tag_mwe(tokens: List[str], lemmas: List[str], pos_tags: List[str],
mwe_lexicon_lookup: OrderedDict[str, List[str]]
) -> Tuple[List[List[str]], List[int]]:
'''
Given the tokens, lemmas, and POS tags for each word in a text along with a
Multi Word Expression lexicon lookup, it will return a `Tuple` of length 2
containing:
1. `List` of USAS semantic tags for each token, whereby the most likely tag is the first tag
in the `List`. The `List` of tags returned are based on the MWE rules below.
2. `List` of ids, each id defines which MWE a token belongs to; an id of `0`
represents a token that is not part of an MWE.
# MWE Rules
The MWE lexicon lookup contains an MWE template as its key and a
`List` of semantic tags as its value. Given this:
Starting with the longest n-gram templates assign semantic tags to tokens
in the following order:
For each template of length *n*:
1. Match on tokens and POS tags.
2. Match on lemma and POS tags.
3. Match on lower cased tokens and POS tags.
4. Match on lower cased lemmas and POS tags.
Then repeat this process for `n = n-1`. Stop when `n==2`, i.e. an
MWE has to have at least 2 tokens.
**Note** that the MWE rules may not cover all tokens, therefore any
token not covered will be given the `Z99` semantic tag. For example
if the semantic tags returned from this function are:
`[[A1], [A1], [Z2, Z3], [Z2, Z3], [Z99]]` then the last token was not
covered by any of the MWE rules, hence it was tagged `[Z99]`.
# Parameters
tokens : `List[str]`
The tokens that are within the text.
lemmas : `List[str]`
The lemmas of the tokens.
pos_tags : `List[str]`
The Part Of Speech tags of the tokens.
mwe_lexicon_lookup : `OrderedDict[str, List[str]]`
An MWE lexicon lookup that contains MWE templates as keys and a `List` of
semantic tags as values. The dictionary should be ordered by the n-gram
length of the templates, with the largest value of *n* first and the
smallest last. For example:
`collections.OrderedDict([('United_noun States_noun of_noun America_noun', ['Z2']),
('United_noun States_noun', ['Z2'])])`
# Returns
`Tuple[List[List[str]], List[int]]`
'''

def create_mwe_template(text_iterable_1: Iterable[str],
text_iterable_2: Iterable[str]) -> str:
'''
Given two iterables of Strings, will return a String
in the same format as the MWE templates:
`{1_1}_{2_1} {1_2}_{2_2} {1_3}_{2_3}`
Where `1_1` represents the first string in `text_iterable_1` and `2_1`
represents the first string in `text_iterable_2`. In this example we
assume the iterables are of length `3`.
# Parameters
text_iterable_1 : `Iterable[str]`
An iterable of Strings, typically this would be either tokens or
lemmas.
text_iterable_2 : `Iterable[str]`
An iterable of Strings, typically this would be the POS tags
associated to `text_iterable_1`.
# Returns
`str`
'''
mwe_template_parts: List[str] = []
for text_1, text_2 in zip(text_iterable_1, text_iterable_2):
mwe_template_parts.append(f'{text_1}_{text_2}')
return ' '.join(mwe_template_parts)
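# For example, create_mwe_template(['East', 'London'], ['noun', 'noun'])
# returns 'East_noun London_noun'.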

def char_to_token_index(mwe_template: str,
token_delimenter: str
) -> Dict[int, int]:
'''
Given an MWE template, will return a dictionary mapping character index to
token index. **Note** we assume that the token delimiter is always a
single whitespace.
# Parameters
mwe_template : `str`
A MWE template.
token_delimenter : `str`
A string that delimits tokens within the `mwe_template`. At the
moment this has to be a single whitespace, i.e. ` `.
# Returns
`Dict[int, int]`
'''
char_to_token_mapping: Dict[int, int] = dict()
token_index = 0
for char_index, char in enumerate(mwe_template):
char_to_token_mapping[char_index] = token_index
if char == token_delimenter:
token_index += 1
return char_to_token_mapping
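# For example, for the template 'East_noun London_noun' characters 0-9 map
# to token index 0 (the trailing delimiter maps to the preceding token) and
# characters 10-20 map to token index 1.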

def find_and_tag_template(mwe_template: str, semantic_tags: List[str],
text_in_mwe_template_format: str,
text_mwe_semantic_tags: List[List[str]],
char_to_token_mapping: Dict[int, int],
mwe_ids: List[int],
current_mwe_id: int) -> int:
'''
It searches for all occurrences
of the `mwe_template` in `text_in_mwe_template_format`, for each
occurrence it looks up the token index(es) through `char_to_token_mapping`
and updates the `text_mwe_semantic_tags` with the `semantic_tags` at the
given index(es). The `mwe_ids` are also updated in a similar manner based
on the `current_mwe_id`. If the `mwe_ids` for any of the token
index(es) contain an id greater than 0 then neither the
`text_mwe_semantic_tags` nor `mwe_ids` will be modified, as we assume
the semantic tags for those tokens have already been assigned correctly.
Returns the next available MWE id.
**Note** this function modifies `text_mwe_semantic_tags` and `mwe_ids`.
# Parameters
mwe_template : `str`
An MWE template that has come from an MWE lexicon
semantic_tags : `List[str]`
The semantic tags that are associated with the `mwe_template`
text_in_mwe_template_format : `str`
The tokens or lemmas, from the text to be tagged, that have been
combined with their associated POS tags to form an MWE template.
This allows the MWE template, `mwe_template`, from the MWE lexicon
to be searched for within the text.
text_mwe_semantic_tags : `List[List[str]]`
The semantic tags associated to each token in the text that is to be
tagged.
char_to_token_mapping : `Dict[int, int]`
A mapping of character index to token index for the `text_in_mwe_template_format`
mwe_ids : `List[int]`
Each id defines which MWE a token belongs to; an id of `0`
represents a token that is not part of an MWE.
current_mwe_id : `int`
MWE id to tag the next token(s) with.
# Returns
`int`
'''
for match in re.finditer(mwe_template, text_in_mwe_template_format):
token_start = char_to_token_mapping[match.start()]
# match.end() is one index beyond the last matched character, hence the (- 1)
token_end = char_to_token_mapping[match.end() - 1]
if any(mwe_ids[token_start:token_end]):
continue
for token_index in range(token_start, token_end + 1):
mwe_ids[token_index] = current_mwe_id
text_mwe_semantic_tags[token_index] = semantic_tags
current_mwe_id += 1
return current_mwe_id

token_delimenter = ' '

token_pos = create_mwe_template(tokens, pos_tags)
token_pos_lower = token_pos.lower()
token_pos_index_mapping = char_to_token_index(token_pos, token_delimenter)

lemma_pos = create_mwe_template(lemmas, pos_tags)
lemma_pos_lower = lemma_pos.lower()
lemma_pos_index_mapping = char_to_token_index(lemma_pos, token_delimenter)

number_tokens = len(token_pos.split(token_delimenter))
mwe_semantic_tags = [['Z99'] for _ in range(number_tokens)]
mwe_ids = [0 for _ in range(number_tokens)]
current_mwe_id = 1
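# Templates are iterated longest n-gram first (the lexicon lookup is ordered),
# so longer MWEs claim their tokens before shorter, overlapping ones can.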
for mwe_template, semantic_tags in mwe_lexicon_lookup.items():

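# Rule 1: match on tokens and POS tags.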
if mwe_template in token_pos:
current_mwe_id = find_and_tag_template(mwe_template, semantic_tags,
token_pos,
mwe_semantic_tags,
token_pos_index_mapping,
mwe_ids, current_mwe_id)

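# Rule 2: match on lemmas and POS tags.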
if mwe_template in lemma_pos:
current_mwe_id = find_and_tag_template(mwe_template, semantic_tags,
lemma_pos,
mwe_semantic_tags,
lemma_pos_index_mapping,
mwe_ids, current_mwe_id)

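# Rule 3: match on lower cased tokens and POS tags.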
if mwe_template in token_pos_lower:
current_mwe_id = find_and_tag_template(mwe_template, semantic_tags,
token_pos_lower,
mwe_semantic_tags,
token_pos_index_mapping,
mwe_ids, current_mwe_id)

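# Rule 4: match on lower cased lemmas and POS tags.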
if mwe_template in lemma_pos_lower:
current_mwe_id = find_and_tag_template(mwe_template, semantic_tags,
lemma_pos_lower,
mwe_semantic_tags,
lemma_pos_index_mapping,
mwe_ids, current_mwe_id)
return mwe_semantic_tags, mwe_ids


def _tag_token(text: str, lemma: str, pos: List[str],
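An illustrative sketch of how the new `_tag_mwe` function can be called, based on
the docstring above (it assumes the private function is imported directly, as the
new test module below does):

import collections
from pymusas.taggers.rule_based import _tag_mwe

tokens = ['East', 'London', 'is', 'great']
lemmas = ['east', 'london', 'be', 'great']
pos_tags = ['noun', 'noun', 'verb', 'adj']
mwe_lexicon = collections.OrderedDict([('East_noun London_noun', ['Z2'])])

usas_tags, mwe_ids = _tag_mwe(tokens, lemmas, pos_tags, mwe_lexicon)
# usas_tags == [['Z2'], ['Z2'], ['Z99'], ['Z99']]
# mwe_ids == [1, 1, 0, 0]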
3 changes: 2 additions & 1 deletion tests/data/taggers/mwe_basic_lexicon.tsv
@@ -1,10 +1,11 @@
mwe_template semantic_tags
North_noun East_noun London_noun Z1
East_noun London_noun Z2
east_noun london_noun Z4
south_noun london_noun Z3
west_noun london_noun Z5
east_noun lancaster_noun Z1
North_noun East_noun London_noun Z1
All_adv well_adj A1
all_adv well_adj A4
aall_adv well_adj A2
test_noun test_noun A2
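The test module below loads this file with
`pymusas.lexicon_collection.MWELexiconCollection.from_tsv` and passes the result
straight to `_tag_mwe`, so each `mwe_template` row is expected to map to a `List`
of semantic tags. A rough sketch (the exact collection behaviour is an assumption
based on how the test uses it):

from pathlib import Path
from pymusas.lexicon_collection import MWELexiconCollection

lexicon_lookup = MWELexiconCollection.from_tsv(Path('tests', 'data', 'taggers', 'mwe_basic_lexicon.tsv'))
# lexicon_lookup['East_noun London_noun'] is expected to give ['Z2']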
38 changes: 20 additions & 18 deletions tests/data/taggers/rule_based_mwe_input_output.json
@@ -1,20 +1,22 @@
[
{"token": "East", "lemma": "south", "pos": "noun", "usas": "Z2", "mwe_id": 1},
{"token": "London", "lemma": "london", "pos": "noun", "usas": "Z2", "mwe_id": 1},
{"token": "east", "lemma": "east", "pos": "noun", "usas": "", "mwe_id": 0},
{"token": "Eastt", "lemma": "east", "pos": "noun", "usas": "", "mwe_id": 0},
{"token": "west", "lemma": "South", "pos": "noun", "usas": "Z5", "mwe_id": 2},
{"token": "london", "lemma": "london", "pos": "noun", "usas": "Z5", "mwe_id": 2},
{"token": "another", "lemma": "ano", "pos": "det", "usas": "", "mwe_id": 0},
{"token": "North", "lemma": "north", "pos": "noun", "usas": "Z1", "mwe_id": 3},
{"token": "East", "lemma": "east", "pos": "noun", "usas": "Z1", "mwe_id": 3},
{"token": "London", "lemma": "london", "pos": "noun", "usas": "Z1", "mwe_id": 3},
{"token": "other", "lemma": "ano", "pos": "det", "usas": "", "mwe_id": 0},
{"token": "easst", "lemma": "East", "pos": "noun", "usas": "Z1", "mwe_id": 4},
{"token": "lonndon", "lemma": "Lancaster", "pos": "noun", "usas": "Z1", "mwe_id": 4},
{"token": "something", "lemma": "some", "pos": "det", "usas": "", "mwe_id": 0},
{"token": "Objective", "lemma": "can", "pos": "obj", "usas": "", "mwe_id": 0},
{"token": "AAll", "lemma": "All", "pos": "adv", "usas": "A1", "mwe_id": 5},
{"token": "well", "lemma": "well", "pos": "adj", "usas": "A1", "mwe_id": 5},
{"token": "two", "lemma": "tw", "pos": "num", "usas": "", "mwe_id": 0}
{"token": "East", "lemma": "south", "pos": "noun", "usas": "Z2", "mwe_id": 2},
{"token": "London", "lemma": "london", "pos": "noun", "usas": "Z2", "mwe_id": 2},
{"token": "east", "lemma": "east", "pos": "noun", "usas": "Z99", "mwe_id": 0},
{"token": "Eastt", "lemma": "east", "pos": "noun", "usas": "Z99", "mwe_id": 0},
{"token": "west", "lemma": "South", "pos": "noun", "usas": "Z3", "mwe_id": 3},
{"token": "london", "lemma": "london", "pos": "noun", "usas": "Z3", "mwe_id": 3},
{"token": "another", "lemma": "ano", "pos": "det", "usas": "Z99", "mwe_id": 0},
{"token": "North", "lemma": "north", "pos": "noun", "usas": "Z1", "mwe_id": 1},
{"token": "East", "lemma": "east", "pos": "noun", "usas": "Z1", "mwe_id": 1},
{"token": "London", "lemma": "london", "pos": "noun", "usas": "Z1", "mwe_id": 1},
{"token": "other", "lemma": "ano", "pos": "det", "usas": "Z99", "mwe_id": 0},
{"token": "easst", "lemma": "East", "pos": "noun", "usas": "Z1", "mwe_id": 5},
{"token": "lonndon", "lemma": "Lancaster", "pos": "noun", "usas": "Z1", "mwe_id": 5},
{"token": "something", "lemma": "some", "pos": "det", "usas": "Z99", "mwe_id": 0},
{"token": "Objective", "lemma": "can", "pos": "obj", "usas": "Z99", "mwe_id": 0},
{"token": "AAll", "lemma": "All", "pos": "adv", "usas": "A1", "mwe_id": 6},
{"token": "well", "lemma": "well", "pos": "adj", "usas": "A1", "mwe_id": 6},
{"token": "two", "lemma": "tw", "pos": "num", "usas": "Z99", "mwe_id": 0},
{"token": "wesst", "lemma": "west", "pos": "noun", "usas": "Z5", "mwe_id": 4},
{"token": "london", "lemma": "london", "pos": "noun", "usas": "Z5", "mwe_id": 4}
]
114 changes: 114 additions & 0 deletions tests/taggers/test_rule_based_mwe.py
@@ -0,0 +1,114 @@
import json
from pathlib import Path
from typing import List, OrderedDict, Tuple
import collections

from pymusas.lexicon_collection import MWELexiconCollection
from pymusas.taggers.rule_based import _tag_mwe


DATA_DIR = Path(__file__, '..', '..', 'data').resolve()
TAGGER_DATA_DIR = Path(DATA_DIR, 'taggers')

BASIC_LEXICON = Path(TAGGER_DATA_DIR, 'mwe_basic_lexicon.tsv')
BASIC_DATA = Path(TAGGER_DATA_DIR, 'rule_based_mwe_input_output.json')


def generate_tag_test_data(test_data_file: Path, mwe_lexicon_file: Path
) -> Tuple[List[str],
List[str],
List[str],
OrderedDict[str, List[str]],
List[List[str]],
List[int]]:
'''
Given the test data stored at `test_data_file`, and
the MWE lexicon at `mwe_lexicon_file`, it returns this data as a
Tuple of length 6:
1. A List of `tokens`, from the `test_data_file`.
2. A List of `lemmas`, from the `test_data_file`.
3. A List of `POS tags`, from the `test_data_file`.
4. The MWE lexicon generated by passing the `mwe_lexicon_file` to the
`pymusas.lexicon_collection.MWELexiconCollection.from_tsv` method.
5. A list of lists of expected semantic tags that should be generated based
on the associated `token`, `lemma`, and `pos` values (items 1-3 of the tuple) and
the MWE lexicon data (item 4 of the tuple).
6. A list of expected `mwe_id` values, whereby each id defines which MWE a token
belongs to; an id of 0 represents a token that is not part of an MWE.
# Parameters
test_data_file : `Path`
A JSON file containing an Array of Objects. Each object must contain the
following properties/keys:
1. token, type str
2. lemma, type str
3. pos, type str
4. usas, type str
5. mwe_id, type int
mwe_lexicon_file : `Path`
A TSV file that can be converted into a :class:`pymusas.lexicon_collection.MWELexiconCollection`
by using the class method :func:`pymusas.lexicon_collection.MWELexiconCollection.from_tsv`
# Returns
`Tuple[List[str], List[str], List[str], OrderedDict[str, List[str]], List[List[str]], List[int]]`
'''
test_tokens: List[str] = []
test_lemmas: List[str] = []
test_pos_tags: List[str] = []

expected_usas_tags: List[List[str]] = []
expected_mwe_ids: List[int] = []
with test_data_file.open('r') as test_data_fp:
for token_data in json.load(test_data_fp):
test_tokens.append(token_data['token'])
test_lemmas.append(token_data['lemma'])
test_pos_tags.append(token_data['pos'])
expected_usas_tags.append([token_data['usas']])
expected_mwe_ids.append(token_data['mwe_id'])

lexicon_lookup = MWELexiconCollection.from_tsv(mwe_lexicon_file)

return (test_tokens, test_lemmas, test_pos_tags, lexicon_lookup,
expected_usas_tags, expected_mwe_ids)


def test_tag_mwe_tokens__basic_rules() -> None:
'''
This tests the `_tag_mwe` function with the basic rules that come from
using the `mwe_basic_lexicon.tsv` and `rule_based_mwe_input_output.json` files.
The basic rules for the MWE templates are:
Starting with the longest n-gram templates assign semantic tags to tokens
without semantic tags in the following order:
1. Match on tokens and POS tags.
2. Match on lemma and POS tags.
3. Match on lower cased tokens and POS tags.
4. Match on lower cased lemmas and POS tags.
Then repeat this process for `n = n-1`. Stop when `n==2`, i.e. an
MWE has to have at least 2 tokens.
'''
(tokens, lemmas, pos_tags, mwe_lexicon,
expected_usas, expected_mwe_ids) = generate_tag_test_data(BASIC_DATA, BASIC_LEXICON)

# Test that it returns all Z99 when we have no MWE rules
empty_mwe_lexicon: OrderedDict = collections.OrderedDict()
usas_tags, mwe_ids = _tag_mwe(tokens, lemmas, pos_tags, empty_mwe_lexicon)
all_z99_tags = [['Z99'] for _ in tokens]
all_0_ids = [0 for _ in tokens]
assert all_z99_tags == usas_tags
assert all_0_ids == mwe_ids

# Test that it covers all of the non-special-syntax cases, i.e. all of the
# cases that do not contain a wildcard or curly braces.
usas_tags, mwe_ids = _tag_mwe(tokens, lemmas, pos_tags, mwe_lexicon)
assert expected_usas == usas_tags
assert expected_mwe_ids == mwe_ids
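# Assuming a standard pytest setup for the repository, this test can be run on
# its own with: python -m pytest tests/taggers/test_rule_based_mwe.py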



