First version of MWE matching with no special syntax #24

apmoore1 committed Jan 27, 2022
1 parent d48656a commit bf83c87
Showing 4 changed files with 358 additions and 20 deletions.
223 changes: 222 additions & 1 deletion pymusas/taggers/rule_based.py
@@ -1,4 +1,225 @@
from typing import Dict, Iterable, Iterator, List, Optional, Tuple
import re
from typing import Dict, Iterable, Iterator, List, Optional, OrderedDict, Tuple


def _tag_mwe(tokens: List[str], lemmas: List[str], pos_tags: List[str],
mwe_lexicon_lookup: OrderedDict[str, List[str]]
) -> Tuple[List[List[str]], List[int]]:
'''
Given the tokens, lemmas, and POS tags for each word in a text along with a
Multi Word Expression lexicon lookup, it will return a `Tuple` of length 2
containing:
1. `List` of USAS semantic tags for each token, whereby the most likely tag is the first tag
in the `List`. The `List` of tags returned are based on the MWE rules below.
2. `List` of ids, each id defines which MWE a token belongs to; an id of `0`
represents a token that is not part of an MWE.
# MWE Rules
The MWE lexicon lookup contains an MWE template as its key and a
`List` of semantic tags as its value. Given this:
Starting with the longest n-gram templates assign semantic tags to tokens
in the following order:
For each template of length *n*:
1. Match on tokens and POS tags.
2. Match on lemma and POS tags.
3. Match on lower cased tokens and POS tags.
4. Match on lower cased lemmas and POS tags.
Then repeat this process for `n = n-1`. Stop when `n==2`, i.e. an
MWE has to have at least 2 tokens.
**Note** that the MWE rules may not cover all tokens, therefore any
token not covered will be given the `Z99` semantic tag. For example
if the semantic tags returned from this function are:
`[[A1], [A1], [Z2, Z3], [Z2, Z3], [Z99]]` then the last token was not
covered by any of the MWE rules, hence it was tagged `[Z99]`.
# Parameters
tokens : `List[str]`
The tokens that are within the text.
lemmas : `List[str]`
The lemmas of the tokens.
pos_tags : `List[str]`
The Part Of Speech tags of the tokens.
mwe_lexicon_lookup : `OrderedDict[str, List[str]]`
An MWE lexicon lookup that contains MWE templates as keys and a `List` of
semantic tags as values. The dictionary should be ordered by the n-gram
length of the templates, with the largest value of *n* first and the
smallest last. For example:
`collections.OrderedDict([('United_noun States_noun of_noun America_noun', ['Z2']),
('United_noun States_noun', ['Z2'])])`
# Returns
`Tuple[List[List[str]], List[int]]`
'''

def create_mwe_template(text_iterable_1: Iterable[str],
text_iterable_2: Iterable[str]) -> str:
'''
Given two iterables of Strings, will return a String
in the same format as the MWE templates:
`{1_1}_{2_1} {1_2}_{2_2} {1_3}_{2_3}`
Where `1_1` represents the first string in `text_iterable_1` and `2_1`
represents the first string in `text_iterable_2`. In this example we
assume the iterables are of length `3`.
# Parameters
text_iterable_1 : `Iterable[str]`
An iterable of Strings, typically this would be either tokens or
lemmas.
text_iterable_2 : `Iterable[str]`
An iterable of Strings, typically this would be the POS tags
associated to `text_iterable_1`.
# Returns
`str`
'''
mwe_template_parts: List[str] = []
for text_1, text_2 in zip(text_iterable_1, text_iterable_2):
mwe_template_parts.append(f'{text_1}_{text_2}')
return ' '.join(mwe_template_parts)
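# For example, create_mwe_template(['East', 'London'], ['noun', 'noun'])
# returns 'East_noun London_noun'.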

def char_to_token_index(mwe_template: str,
token_delimenter: str
) -> Dict[int, int]:
'''
Given an MWE template, will return a dictionary mapping character index to
token index. **Note** we assume that the token delimiter is always a
single whitespace.
# Parameters
mwe_template : `str`
A MWE template.
token_delimenter : `str`
A string that delimits tokens within the `mwe_template`. At the
moment this has to be a single whitespace, i.e. ` `.
# Returns
`Dict[int, int]`
'''
char_to_token_mapping: Dict[int, int] = dict()
token_index = 0
for char_index, char in enumerate(mwe_template):
char_to_token_mapping[char_index] = token_index
if char == token_delimenter:
token_index += 1
return char_to_token_mapping
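# For example, for the template 'East_noun London_noun' characters 0-9 map
# to token index 0 (the trailing delimiter maps to the preceding token) and
# characters 10-20 map to token index 1.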

def find_and_tag_template(mwe_template: str, semantic_tags: List[str],
text_in_mwe_template_format: str,
text_mwe_semantic_tags: List[List[str]],
char_to_token_mapping: Dict[int, int],
mwe_ids: List[int],
current_mwe_id: int) -> int:
'''
It searches for all occurrences
of the `mwe_template` in `text_in_mwe_template_format`, for each
occurrence it looks up the token index(es) through `char_to_token_mapping`
and updates the `text_mwe_semantic_tags` with the `semantic_tags` at the
given index(es). The `mwe_ids` are also updated in a similar manner based
on the `current_mwe_id`. If the `mwe_ids` for any of the token
index(es) contain an id greater than 0 then neither the
`text_mwe_semantic_tags` nor `mwe_ids` will be modified, as we assume
the semantic tags for those tokens have already been assigned correctly.
Returns the next available MWE id.
**Note** this function modifies `text_mwe_semantic_tags` and `mwe_ids`.
# Parameters
mwe_template : `str`
An MWE template that has come from an MWE lexicon
semantic_tags : `List[str]`
The semantic tags that are associated with the `mwe_template`
text_in_mwe_template_format : `str`
The tokens or lemmas, from the text to be tagged, that have been
combined with their associated POS tags to form an MWE template.
This allows the MWE template, `mwe_template`, from the MWE lexicon
to be searched for within the text.
text_mwe_semantic_tags : `List[List[str]]`
The semantic tags associated to each token in the text that is to be
tagged.
char_to_token_mapping : `Dict[int, int]`
A mapping of character index to token index for the `text_in_mwe_template_format`
mwe_ids : `List[int]`
Each id defines which MWE a token belongs to; an id of `0`
represents a token that is not part of an MWE.
current_mwe_id : `int`
MWE id to tag the next token(s) with.
# Returns
`int`
'''
for match in re.finditer(mwe_template, text_in_mwe_template_format):
token_start = char_to_token_mapping[match.start()]
# match.end() is one index beyond the last matched character, hence the (- 1)
token_end = char_to_token_mapping[match.end() - 1]
if any(mwe_ids[token_start:token_end]):
continue
for token_index in range(token_start, token_end + 1):
mwe_ids[token_index] = current_mwe_id
text_mwe_semantic_tags[token_index] = semantic_tags
current_mwe_id += 1
return current_mwe_id

token_delimenter = ' '

token_pos = create_mwe_template(tokens, pos_tags)
token_pos_lower = token_pos.lower()
token_pos_index_mapping = char_to_token_index(token_pos, token_delimenter)

lemma_pos = create_mwe_template(lemmas, pos_tags)
lemma_pos_lower = lemma_pos.lower()
lemma_pos_index_mapping = char_to_token_index(lemma_pos, token_delimenter)

number_tokens = len(token_pos.split(token_delimenter))
mwe_semantic_tags = [['Z99'] for _ in range(number_tokens)]
mwe_ids = [0 for _ in range(number_tokens)]
current_mwe_id = 1
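# Templates are iterated longest n-gram first (the lexicon lookup is ordered),
# so longer MWEs claim their tokens before shorter, overlapping ones can.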
for mwe_template, semantic_tags in mwe_lexicon_lookup.items():

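# Rule 1: match on tokens and POS tags.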
if mwe_template in token_pos:
current_mwe_id = find_and_tag_template(mwe_template, semantic_tags,
token_pos,
mwe_semantic_tags,
token_pos_index_mapping,
mwe_ids, current_mwe_id)

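# Rule 2: match on lemmas and POS tags.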
if mwe_template in lemma_pos:
current_mwe_id = find_and_tag_template(mwe_template, semantic_tags,
lemma_pos,
mwe_semantic_tags,
lemma_pos_index_mapping,
mwe_ids, current_mwe_id)

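# Rule 3: match on lower cased tokens and POS tags.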
if mwe_template in token_pos_lower:
current_mwe_id = find_and_tag_template(mwe_template, semantic_tags,
token_pos_lower,
mwe_semantic_tags,
token_pos_index_mapping,
mwe_ids, current_mwe_id)

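# Rule 4: match on lower cased lemmas and POS tags.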
if mwe_template in lemma_pos_lower:
current_mwe_id = find_and_tag_template(mwe_template, semantic_tags,
lemma_pos_lower,
mwe_semantic_tags,
lemma_pos_index_mapping,
mwe_ids, current_mwe_id)
return mwe_semantic_tags, mwe_ids


def _tag_token(text: str, lemma: str, pos: List[str],
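An illustrative sketch of how the new `_tag_mwe` function can be called, based on
the docstring above (it assumes the private function is imported directly, as the
new test module below does):

import collections
from pymusas.taggers.rule_based import _tag_mwe

tokens = ['East', 'London', 'is', 'great']
lemmas = ['east', 'london', 'be', 'great']
pos_tags = ['noun', 'noun', 'verb', 'adj']
mwe_lexicon = collections.OrderedDict([('East_noun London_noun', ['Z2'])])

usas_tags, mwe_ids = _tag_mwe(tokens, lemmas, pos_tags, mwe_lexicon)
# usas_tags == [['Z2'], ['Z2'], ['Z99'], ['Z99']]
# mwe_ids == [1, 1, 0, 0]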
3 changes: 2 additions & 1 deletion tests/data/taggers/mwe_basic_lexicon.tsv
@@ -1,10 +1,11 @@
mwe_template semantic_tags
North_noun East_noun London_noun Z1
East_noun London_noun Z2
east_noun london_noun Z4
south_noun london_noun Z3
west_noun london_noun Z5
east_noun lancaster_noun Z1
North_noun East_noun London_noun Z1
All_adv well_adj A1
all_adv well_adj A4
aall_adv well_adj A2
test_noun test_noun A2
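The test module below loads this file with
`pymusas.lexicon_collection.MWELexiconCollection.from_tsv` and passes the result
straight to `_tag_mwe`, so each `mwe_template` row is expected to map to a `List`
of semantic tags. A rough sketch (the exact collection behaviour is an assumption
based on how the test uses it):

from pathlib import Path
from pymusas.lexicon_collection import MWELexiconCollection

lexicon_lookup = MWELexiconCollection.from_tsv(Path('tests', 'data', 'taggers', 'mwe_basic_lexicon.tsv'))
# lexicon_lookup['East_noun London_noun'] is expected to give ['Z2']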
38 changes: 20 additions & 18 deletions tests/data/taggers/rule_based_mwe_input_output.json
@@ -1,20 +1,22 @@
[
{"token": "East", "lemma": "south", "pos": "noun", "usas": "Z2", "mwe_id": 1},
{"token": "London", "lemma": "london", "pos": "noun", "usas": "Z2", "mwe_id": 1},
{"token": "east", "lemma": "east", "pos": "noun", "usas": "", "mwe_id": 0},
{"token": "Eastt", "lemma": "east", "pos": "noun", "usas": "", "mwe_id": 0},
{"token": "west", "lemma": "South", "pos": "noun", "usas": "Z5", "mwe_id": 2},
{"token": "london", "lemma": "london", "pos": "noun", "usas": "Z5", "mwe_id": 2},
{"token": "another", "lemma": "ano", "pos": "det", "usas": "", "mwe_id": 0},
{"token": "North", "lemma": "north", "pos": "noun", "usas": "Z1", "mwe_id": 3},
{"token": "East", "lemma": "east", "pos": "noun", "usas": "Z1", "mwe_id": 3},
{"token": "London", "lemma": "london", "pos": "noun", "usas": "Z1", "mwe_id": 3},
{"token": "other", "lemma": "ano", "pos": "det", "usas": "", "mwe_id": 0},
{"token": "easst", "lemma": "East", "pos": "noun", "usas": "Z1", "mwe_id": 4},
{"token": "lonndon", "lemma": "Lancaster", "pos": "noun", "usas": "Z1", "mwe_id": 4},
{"token": "something", "lemma": "some", "pos": "det", "usas": "", "mwe_id": 0},
{"token": "Objective", "lemma": "can", "pos": "obj", "usas": "", "mwe_id": 0},
{"token": "AAll", "lemma": "All", "pos": "adv", "usas": "A1", "mwe_id": 5},
{"token": "well", "lemma": "well", "pos": "adj", "usas": "A1", "mwe_id": 5},
{"token": "two", "lemma": "tw", "pos": "num", "usas": "", "mwe_id": 0}
{"token": "East", "lemma": "south", "pos": "noun", "usas": "Z2", "mwe_id": 2},
{"token": "London", "lemma": "london", "pos": "noun", "usas": "Z2", "mwe_id": 2},
{"token": "east", "lemma": "east", "pos": "noun", "usas": "Z99", "mwe_id": 0},
{"token": "Eastt", "lemma": "east", "pos": "noun", "usas": "Z99", "mwe_id": 0},
{"token": "west", "lemma": "South", "pos": "noun", "usas": "Z3", "mwe_id": 3},
{"token": "london", "lemma": "london", "pos": "noun", "usas": "Z3", "mwe_id": 3},
{"token": "another", "lemma": "ano", "pos": "det", "usas": "Z99", "mwe_id": 0},
{"token": "North", "lemma": "north", "pos": "noun", "usas": "Z1", "mwe_id": 1},
{"token": "East", "lemma": "east", "pos": "noun", "usas": "Z1", "mwe_id": 1},
{"token": "London", "lemma": "london", "pos": "noun", "usas": "Z1", "mwe_id": 1},
{"token": "other", "lemma": "ano", "pos": "det", "usas": "Z99", "mwe_id": 0},
{"token": "easst", "lemma": "East", "pos": "noun", "usas": "Z1", "mwe_id": 5},
{"token": "lonndon", "lemma": "Lancaster", "pos": "noun", "usas": "Z1", "mwe_id": 5},
{"token": "something", "lemma": "some", "pos": "det", "usas": "Z99", "mwe_id": 0},
{"token": "Objective", "lemma": "can", "pos": "obj", "usas": "Z99", "mwe_id": 0},
{"token": "AAll", "lemma": "All", "pos": "adv", "usas": "A1", "mwe_id": 6},
{"token": "well", "lemma": "well", "pos": "adj", "usas": "A1", "mwe_id": 6},
{"token": "two", "lemma": "tw", "pos": "num", "usas": "Z99", "mwe_id": 0},
{"token": "wesst", "lemma": "west", "pos": "noun", "usas": "Z5", "mwe_id": 4},
{"token": "london", "lemma": "london", "pos": "noun", "usas": "Z5", "mwe_id": 4}
]
114 changes: 114 additions & 0 deletions tests/taggers/test_rule_based_mwe.py
@@ -0,0 +1,114 @@
import json
from pathlib import Path
from typing import List, OrderedDict, Tuple
import collections

from pymusas.lexicon_collection import MWELexiconCollection
from pymusas.taggers.rule_based import _tag_mwe


DATA_DIR = Path(__file__, '..', '..', 'data').resolve()
TAGGER_DATA_DIR = Path(DATA_DIR, 'taggers')

BASIC_LEXICON = Path(TAGGER_DATA_DIR, 'mwe_basic_lexicon.tsv')
BASIC_DATA = Path(TAGGER_DATA_DIR, 'rule_based_mwe_input_output.json')


def generate_tag_test_data(test_data_file: Path, mwe_lexicon_file: Path
) -> Tuple[List[str],
List[str],
List[str],
OrderedDict[str, List[str]],
List[List[str]],
List[int]]:
'''
Given the test data stored at `test_data_file`, and
the MWE lexicon at `mwe_lexicon_file`, it returns this data as a
Tuple of length 6:
1. A List of `tokens`, from the `test_data_file`.
2. A List of `lemmas`, from the `test_data_file`.
3. A List of `POS tags`, from the `test_data_file`.
4. The MWE lexicon generated by passing the `mwe_lexicon_file` to the
`pymusas.lexicon_collection.MWELexiconCollection.from_tsv` method.
5. A list of lists of expected semantic tags that should be generated based
on the associated `token`, `lemma`, and `pos` values (items 1-3 of the tuple) and
the MWE lexicon data (item 4 of the tuple).
6. A list of expected `mwe_id` values, whereby each id defines which MWE a token
belongs to; an id of 0 represents a token that is not part of an MWE.
# Parameters
test_data_file : `Path`
A JSON file containing an Array of Objects. Each object must contain the
following properties/keys:
1. token, type str
2. lemma, type str
3. pos, type str
4. usas, type str
5. mwe_id, type int
mwe_lexicon_file : `Path`
A TSV file that can be converted into a :class:`pymusas.lexicon_collection.MWELexiconCollection`
by using the class method :func:`pymusas.lexicon_collection.MWELexiconCollection.from_tsv`
# Returns
`Tuple[List[str], List[str], List[str], OrderedDict[str, List[str]], List[List[str]], List[int]]`
'''
test_tokens: List[str] = []
test_lemmas: List[str] = []
test_pos_tags: List[str] = []

expected_usas_tags: List[List[str]] = []
expected_mwe_ids: List[int] = []
with test_data_file.open('r') as test_data_fp:
for token_data in json.load(test_data_fp):
test_tokens.append(token_data['token'])
test_lemmas.append(token_data['lemma'])
test_pos_tags.append(token_data['pos'])
expected_usas_tags.append([token_data['usas']])
expected_mwe_ids.append(token_data['mwe_id'])

lexicon_lookup = MWELexiconCollection.from_tsv(mwe_lexicon_file)

return (test_tokens, test_lemmas, test_pos_tags, lexicon_lookup,
expected_usas_tags, expected_mwe_ids)


def test_tag_mwe_tokens__basic_rules() -> None:
'''
This tests the `_tag_mwe` function with the basic rules that come from
using the `mwe_basic_lexicon.tsv` and `rule_based_mwe_input_output.json` files.
The basic rules for the MWE templates are:
Starting with the longest n-gram templates assign semantic tags to tokens
without semantic tags in the following order:
1. Match on tokens and POS tags.
2. Match on lemma and POS tags.
3. Match on lower cased tokens and POS tags.
4. Match on lower cased lemmas and POS tags.
Then repeat this process for `n = n-1`. Stop when `n==2`, i.e. an
MWE has to have at least 2 tokens.
'''
(tokens, lemmas, pos_tags, mwe_lexicon,
expected_usas, expected_mwe_ids) = generate_tag_test_data(BASIC_DATA, BASIC_LEXICON)

# Test that it returns all Z99 when we have no MWE rules
empty_mwe_lexicon: OrderedDict = collections.OrderedDict()
usas_tags, mwe_ids = _tag_mwe(tokens, lemmas, pos_tags, empty_mwe_lexicon)
all_z99_tags = [['Z99'] for _ in tokens]
all_0_ids = [0 for _ in tokens]
assert all_z99_tags == usas_tags
assert all_0_ids == mwe_ids

# Test that it covers all of the non-special-syntax cases, i.e. all of the
# cases that do not contain a wildcard or curly braces.
usas_tags, mwe_ids = _tag_mwe(tokens, lemmas, pos_tags, mwe_lexicon)
assert expected_usas == usas_tags
assert expected_mwe_ids == mwe_ids
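# Assuming a standard pytest setup for the repository, this test can be run on
# its own with: python -m pytest tests/taggers/test_rule_based_mwe.py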



