Commit: First version of MWE matching with no special syntax #24
Showing 4 changed files with 358 additions and 20 deletions.
mwe_basic_lexicon.tsv
@@ -1,10 +1,11 @@
 mwe_template	semantic_tags
-North_noun East_noun London_noun	Z1
 East_noun London_noun	Z2
 east_noun london_noun	Z4
 south_noun london_noun	Z3
 west_noun london_noun	Z5
 east_noun lancaster_noun	Z1
+North_noun East_noun London_noun	Z1
 All_adv well_adj	A1
 all_adv well_adj	A4
 aall_adv well_adj	A2
+test_noun test_noun	A2
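The lexicon above is consumed through `MWELexiconCollection.from_tsv`, as the new test module below does. A minimal sketch of loading it, assuming the file sits in the working directory; the mapping shape follows the `OrderedDict[str, List[str]]` annotation in the test module, so the printed value should be a list of tags such as ['Z2']:

from pathlib import Path
from pymusas.lexicon_collection import MWELexiconCollection

# Parse the TSV into an ordered mapping of MWE template -> semantic tags,
# e.g. 'East_noun London_noun' -> ['Z2'] per the lexicon rows above.
lexicon = MWELexiconCollection.from_tsv(Path('mwe_basic_lexicon.tsv'))
print(lexicon['East_noun London_noun'])  # expected: ['Z2']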
rule_based_mwe_input_output.json
@@ -1,20 +1,22 @@
 [
-{"token": "East", "lemma": "south", "pos": "noun", "usas": "Z2", "mwe_id": 1},
-{"token": "London", "lemma": "london", "pos": "noun", "usas": "Z2", "mwe_id": 1},
-{"token": "east", "lemma": "east", "pos": "noun", "usas": "", "mwe_id": 0},
-{"token": "Eastt", "lemma": "east", "pos": "noun", "usas": "", "mwe_id": 0},
-{"token": "west", "lemma": "South", "pos": "noun", "usas": "Z5", "mwe_id": 2},
-{"token": "london", "lemma": "london", "pos": "noun", "usas": "Z5", "mwe_id": 2},
-{"token": "another", "lemma": "ano", "pos": "det", "usas": "", "mwe_id": 0},
-{"token": "North", "lemma": "north", "pos": "noun", "usas": "Z1", "mwe_id": 3},
-{"token": "East", "lemma": "east", "pos": "noun", "usas": "Z1", "mwe_id": 3},
-{"token": "London", "lemma": "london", "pos": "noun", "usas": "Z1", "mwe_id": 3},
-{"token": "other", "lemma": "ano", "pos": "det", "usas": "", "mwe_id": 0},
-{"token": "easst", "lemma": "East", "pos": "noun", "usas": "Z1", "mwe_id": 4},
-{"token": "lonndon", "lemma": "Lancaster", "pos": "noun", "usas": "Z1", "mwe_id": 4},
-{"token": "something", "lemma": "some", "pos": "det", "usas": "", "mwe_id": 0},
-{"token": "Objective", "lemma": "can", "pos": "obj", "usas": "", "mwe_id": 0},
-{"token": "AAll", "lemma": "All", "pos": "adv", "usas": "A1", "mwe_id": 5},
-{"token": "well", "lemma": "well", "pos": "adj", "usas": "A1", "mwe_id": 5},
-{"token": "two", "lemma": "tw", "pos": "num", "usas": "", "mwe_id": 0}
+{"token": "East", "lemma": "south", "pos": "noun", "usas": "Z2", "mwe_id": 2},
+{"token": "London", "lemma": "london", "pos": "noun", "usas": "Z2", "mwe_id": 2},
+{"token": "east", "lemma": "east", "pos": "noun", "usas": "Z99", "mwe_id": 0},
+{"token": "Eastt", "lemma": "east", "pos": "noun", "usas": "Z99", "mwe_id": 0},
+{"token": "west", "lemma": "South", "pos": "noun", "usas": "Z3", "mwe_id": 3},
+{"token": "london", "lemma": "london", "pos": "noun", "usas": "Z3", "mwe_id": 3},
+{"token": "another", "lemma": "ano", "pos": "det", "usas": "Z99", "mwe_id": 0},
+{"token": "North", "lemma": "north", "pos": "noun", "usas": "Z1", "mwe_id": 1},
+{"token": "East", "lemma": "east", "pos": "noun", "usas": "Z1", "mwe_id": 1},
+{"token": "London", "lemma": "london", "pos": "noun", "usas": "Z1", "mwe_id": 1},
+{"token": "other", "lemma": "ano", "pos": "det", "usas": "Z99", "mwe_id": 0},
+{"token": "easst", "lemma": "East", "pos": "noun", "usas": "Z1", "mwe_id": 5},
+{"token": "lonndon", "lemma": "Lancaster", "pos": "noun", "usas": "Z1", "mwe_id": 5},
+{"token": "something", "lemma": "some", "pos": "det", "usas": "Z99", "mwe_id": 0},
+{"token": "Objective", "lemma": "can", "pos": "obj", "usas": "Z99", "mwe_id": 0},
+{"token": "AAll", "lemma": "All", "pos": "adv", "usas": "A1", "mwe_id": 6},
+{"token": "well", "lemma": "well", "pos": "adj", "usas": "A1", "mwe_id": 6},
+{"token": "two", "lemma": "tw", "pos": "num", "usas": "Z99", "mwe_id": 0},
+{"token": "wesst", "lemma": "west", "pos": "noun", "usas": "Z5", "mwe_id": 4},
+{"token": "london", "lemma": "london", "pos": "noun", "usas": "Z5", "mwe_id": 4}
 ]
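Each object in this fixture pairs an input token (`token`, `lemma`, `pos`) with the expected tagger output (`usas` tag and `mwe_id`); tokens sharing a non-zero `mwe_id` form one matched MWE, and id 0 marks tokens outside any MWE. As an illustrative sketch that is not part of the commit, the MWE spans in the new version of the file can be recovered like so:

import json
from collections import defaultdict

# Group tokens by mwe_id to recover each matched MWE span;
# id 0 marks tokens that are not part of any MWE, so skip it.
with open('rule_based_mwe_input_output.json') as fp:
    token_data = json.load(fp)

spans = defaultdict(list)
for entry in token_data:
    if entry['mwe_id'] != 0:
        spans[entry['mwe_id']].append(entry['token'])

for mwe_id, tokens in sorted(spans.items()):
    print(mwe_id, tokens)  # e.g. 1 ['North', 'East', 'London']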
@@ -0,0 +1,114 @@
import json
from pathlib import Path
from typing import List, OrderedDict, Tuple
import collections

from pymusas.lexicon_collection import MWELexiconCollection
from pymusas.taggers.rule_based import _tag_mwe


DATA_DIR = Path(__file__, '..', '..', 'data').resolve()
TAGGER_DATA_DIR = Path(DATA_DIR, 'taggers')

BASIC_LEXICON = Path(TAGGER_DATA_DIR, 'mwe_basic_lexicon.tsv')
BASIC_DATA = Path(TAGGER_DATA_DIR, 'rule_based_mwe_input_output.json')

def generate_tag_test_data(test_data_file: Path, mwe_lexicon_file: Path
                           ) -> Tuple[List[str],
                                      List[str],
                                      List[str],
                                      OrderedDict[str, List[str]],
                                      List[List[str]],
                                      List[int]]:
    '''
    Given the test data stored at `test_data_file` and the MWE lexicon at
    `mwe_lexicon_file`, it returns this data as a Tuple of length 6:

    1. A List of `tokens`, from the `test_data_file`.
    2. A List of `lemmas`, from the `test_data_file`.
    3. A List of `POS tags`, from the `test_data_file`.
    4. The MWE lexicon generated by passing the `mwe_lexicon_file` to the
       `pymusas.lexicon_collection.MWELexiconCollection.from_tsv` method.
    5. A list of lists of expected semantic tags that should be generated based
       on the associated `token`, `lemma`, and `pos` from the first three values
       of the tuple and the MWE lexicon data from the fourth tuple value.
    6. A list of expected `mwe_id`s whereby each id defines which MWE a token
       belongs to; an id of 0 represents a token that is not part of an MWE.

    # Parameters

    test_data_file : `Path`
        A JSON file containing an Array of Objects. Each object must contain
        the following properties/keys:
        1. token, type str
        2. lemma, type str
        3. pos, type str
        4. usas, type str
        5. mwe_id, type int
    mwe_lexicon_file : `Path`
        A TSV file that can be converted into a
        :class:`pymusas.lexicon_collection.MWELexiconCollection` by using the
        class method :func:`pymusas.lexicon_collection.MWELexiconCollection.from_tsv`

    # Returns

    `Tuple[List[str], List[str], List[str], OrderedDict[str, List[str]], List[List[str]], List[int]]`
    '''
    test_tokens: List[str] = []
    test_lemmas: List[str] = []
    test_pos_tags: List[str] = []

    expected_usas_tags: List[List[str]] = []
    expected_mwe_ids: List[int] = []
    with test_data_file.open('r') as test_data_fp:
        for token_data in json.load(test_data_fp):
            test_tokens.append(token_data['token'])
            test_lemmas.append(token_data['lemma'])
            test_pos_tags.append(token_data['pos'])
            expected_usas_tags.append([token_data['usas']])
            expected_mwe_ids.append(token_data['mwe_id'])

    lexicon_lookup = MWELexiconCollection.from_tsv(mwe_lexicon_file)

    return (test_tokens, test_lemmas, test_pos_tags, lexicon_lookup,
            expected_usas_tags, expected_mwe_ids)

def test_tag_mwe_tokens__basic_rules() -> None:
    '''
    This tests the `_tag_mwe` function with the basic rules that come from
    using the `mwe_basic_lexicon.tsv` and `rule_based_mwe_input_output.json`
    files. The basic rules for the MWE templates are:

    Starting with the longest n-gram templates, assign semantic tags to tokens
    without semantic tags in the following order:
    1. Match on tokens and POS tags.
    2. Match on lemmas and POS tags.
    3. Match on lower cased tokens and POS tags.
    4. Match on lower cased lemmas and POS tags.

    Then repeat this process for `n = n - 1`. Stop when `n == 2`, i.e. an
    MWE has to have at least 2 tokens.
    '''
    (tokens, lemmas, pos_tags, mwe_lexicon,
     expected_usas, expected_mwe_ids) = generate_tag_test_data(BASIC_DATA, BASIC_LEXICON)

    # Test that it returns all Z99 when we have no MWE rules
    empty_mwe_lexicon: OrderedDict = collections.OrderedDict()
    usas_tags, mwe_ids = _tag_mwe(tokens, lemmas, pos_tags, empty_mwe_lexicon)
    all_z99_tags = [['Z99'] for _ in tokens]
    all_0_ids = [0 for _ in tokens]
    assert all_z99_tags == usas_tags
    assert all_0_ids == mwe_ids

    # Test that it covers all of the non special syntax cases, i.e. all of the
    # cases that do not contain a wildcard or curly braces.
    usas_tags, mwe_ids = _tag_mwe(tokens, lemmas, pos_tags, mwe_lexicon)
    assert expected_usas == usas_tags
    assert expected_mwe_ids == mwe_ids
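For reference, the matching procedure described in the test docstring above can be sketched as follows. This is a minimal illustrative reimplementation, not the pymusas `_tag_mwe` source: the function name `tag_mwe_sketch`, the template key format (`token_pos` parts joined by spaces, mirroring the lexicon above), and the order in which `mwe_id`s are assigned are all assumptions.

from typing import Dict, List, Tuple


def tag_mwe_sketch(tokens: List[str], lemmas: List[str], pos_tags: List[str],
                   lexicon: Dict[str, List[str]]) -> Tuple[List[List[str]], List[int]]:
    # Every token starts untagged (Z99) and outside any MWE (id 0).
    usas_tags: List[List[str]] = [['Z99'] for _ in tokens]
    mwe_ids: List[int] = [0] * len(tokens)
    longest_n = max((len(template.split()) for template in lexicon), default=0)
    next_id = 1
    # Longest templates first; an MWE must span at least 2 tokens, so n == 2 is the floor.
    for n in range(longest_n, 1, -1):
        for start in range(len(tokens) - n + 1):
            end = start + n
            if any(mwe_ids[start:end]):  # span already claimed by a longer MWE
                continue
            # The four match types, in priority order: token/POS, lemma/POS,
            # lower cased token/POS, lower cased lemma/POS.
            candidates = [
                ' '.join(f'{t}_{p}' for t, p in zip(tokens[start:end], pos_tags[start:end])),
                ' '.join(f'{l}_{p}' for l, p in zip(lemmas[start:end], pos_tags[start:end])),
                ' '.join(f'{t.lower()}_{p}' for t, p in zip(tokens[start:end], pos_tags[start:end])),
                ' '.join(f'{l.lower()}_{p}' for l, p in zip(lemmas[start:end], pos_tags[start:end])),
            ]
            for candidate in candidates:
                if candidate in lexicon:
                    for i in range(start, end):
                        usas_tags[i] = list(lexicon[candidate])
                        mwe_ids[i] = next_id
                    next_id += 1
                    break
    return usas_tags, mwe_ids

With an empty lexicon, `longest_n` is 0, no template is tried, and the function returns the all-Z99 tags and all-0 ids that the first half of the test above asserts.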