Skip to content

Commit

Permalink
align_tokens.py: ADD_XPOS option
Browse files (browse the repository at this point in the history)
  • Loading branch information
nschneid committed Jun 10, 2023
1 parent 3da8aa9 commit 8fba92e
Showing 1 changed file with 23 additions and 5 deletions.
28 changes: 23 additions & 5 deletions analysis/align_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,18 @@
used to check that the trees correspond.
"""

ADD_PUNCT_AND_SUBTOKS = True
ADD_PUNCT_AND_SUBTOKS = False
INFER_VAUX = False
INFER_LEMMA = True
INFER_LEMMA = False
ADD_XPOS = {'CD', 'MD', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'} # add XPOS tags in this list

with open('../datasets/twitter_ud.conllu') as f, open('../datasets/ewt-test_iaa50.conllu') as f2:
ud_trees = conllu.parse( #f.read() +
f2.read())
f.read())

cgel_trees = []
with open('../datasets/twitter.cgel') as f, open('../ewt-test_iaa50.adjudicated.cgel') as f2:
for tree in cgel.trees(f2):
with open('../datasets/twitter.cgel') as f, open('../datasets/ewt-test_iaa50.cgel') as f2:
for tree in cgel.trees(f):
cgel_trees.append(tree)

def ud_tok_scanner(ud_tree):
Expand Down Expand Up @@ -180,6 +181,23 @@ def insert_postpunct(cgel_node: Node, punct: str):
n.lemma = cgellemma
# if not explicitly set, the lemma defaults to the token form

# Transfer the UD token's XPOS tag onto the CGEL node `n`.
# The tag is copied only when it is listed in ADD_XPOS, the CGEL lemma
# equals the UD lemma, and the node's category fits the tag:
#   'CD'                      -> node must be 'D' or 'N'
#   any other tag in ADD_XPOS -> node must be 'V' or 'V_aux'
# Every other situation is reported on stderr and leaves n.xpos untouched.
if udn['xpos'] not in ADD_XPOS:
    # Tag is not one we add; a verbal node in this position is still
    # reported, because it ends up without an xpos.
    if n.constituent in ('V', 'V_aux'):
        print('Missing xpos:', n.lexeme, file=sys.stderr)
elif n.lemma != udn['lemma']:
    # Lemmas disagree between the two annotations, so do not copy the tag.
    print('Unexpected lemma:', n.lexeme, n.lemma, n.constituent, udn['lemma'], udn['xpos'], file=sys.stderr)
elif udn['xpos'] == 'CD':
    if n.constituent in ('D', 'N'):
        n.xpos = udn['xpos']
    else:
        print('Expected D or N:', n.lexeme, n.constituent, udn['xpos'], file=sys.stderr)
elif n.constituent in ('V', 'V_aux'):
    n.xpos = udn['xpos']
else:
    print('Expected V(_aux):', n.lexeme, n.constituent, udn['xpos'], file=sys.stderr)

#print(buf)
assert udn
if len(buf)==len(udn['form']): # or (buf,udn['form']) in {("if'","If"), ("of'","of")} | EWT_MISTRANSCRIPTIONS | EWT_SPELLING_CORRECTIONS_IN_CGEL:
Expand Down

1 comment on commit 8fba92e

@nschneid
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#88

Please sign in to comment.