-
Notifications
You must be signed in to change notification settings - Fork 7
/
prepare_ewtb_for_anno.py
executable file
·58 lines (50 loc) · 1.34 KB
/
prepare_ewtb_for_anno.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python2.7
'''
Preprocesses an ARK TweetNLP dataset for GFL annotation.
In the annotation section of each item, the punctuation tokens are removed.
If a word appears multiple times in a tweet,
the tokens are indexed with '~1', '~2', etc.
@author: Nathan Schneider ([email protected])
@since: 2013-02-05
'''
from __future__ import print_function
import os, sys, fileinput, re
from collections import Counter
# Default corpus file. Each line holds one item as three tab-separated
# fields: item ID, tokens, and the POS-tagged text ('word/TAG word/TAG ...').
DEFAULT_INPUT = 'ewtb55.idtokpos'

# POS tags whose tokens count as punctuation: they are never given a '~N'
# index, and each one becomes a line break in the ANNO section.
PUNCT = [',', '.', '-LRB-', '-RRB-']


def _split_tagged(postagged):
    '''
    Split 'word/TAG word/TAG ...' into parallel lists of words and tags.

    The split is made at the LAST slash of each token, so a word that itself
    contains '/' is kept intact.

    @param postagged: whitespace-separated 'word/TAG' tokens
    @return: (words, tags), two lists of equal length
    @raise ValueError: if a token contains no '/'
    '''
    words, tags = [], []
    for tokpos in postagged.split():
        word, slash, tag = tokpos.rpartition('/')
        if not slash:
            raise ValueError('token has no /TAG suffix: %r' % (tokpos,))
        words.append(word)
        tags.append(tag)
    return words, tags


def _index_duplicates(words, tags):
    '''
    Append '~1', '~2', ... to every non-punctuation word that occurs more
    than once among the non-punctuation tokens of the item.

    Punctuation-tagged tokens are neither counted nor renamed, even when
    their surface form equals a repeated non-punctuation word; the tag, not
    just the spelling, decides whether a token takes part in the numbering.

    @param words: word forms, in sentence order
    @param tags: POS tags, parallel to words
    @return: new list of tokens, same length and order as words
    '''
    totals = Counter(w for w, tag in zip(words, tags) if tag not in PUNCT)
    seen = Counter()
    tkns = []
    for w, tag in zip(words, tags):
        if tag not in PUNCT and totals[w] > 1:
            seen[w] += 1
            tkns.append(w + '~' + str(seen[w]))
        else:
            tkns.append(w)
    return tkns


def _anno_text(tkns, tags):
    '''
    Build the ANNO section: the tokens with punctuation removed and a line
    break wherever punctuation stood.

    A run of consecutive punctuation tokens, however long, yields a single
    line break.

    @param tkns: (possibly indexed) tokens, in sentence order
    @param tags: POS tags, parallel to tkns
    @return: the annotation text; may begin and/or end with a newline
    '''
    pieces = []
    for tkn, tag in zip(tkns, tags):
        if tag not in PUNCT:
            pieces.append(tkn)
        elif not pieces or pieces[-1] != '\n':
            # First punctuation token of a run; the rest of the run is dropped.
            pieces.append('\n')
    # Joining with ' ' leaves a space on both sides of each newline; only the
    # one AFTER the newline is removed, so a word directly before a break
    # keeps its trailing space (existing output format, kept on purpose).
    return ' '.join(pieces).replace('\n ', '\n')


def format_item(ln):
    '''
    Render one input line as a GFL annotation stub.

    @param ln: one line of the corpus file, 'ID<TAB>tokens<TAB>postagged',
        with or without its trailing newline
    @return: the text of the item, without the final newline (print() it)
    @raise ValueError: if the line does not have exactly three tab-separated
        fields, or a tagged token lacks a '/'
    '''
    # Remove only a real line terminator; the last line of a file may lack one.
    if ln.endswith('\n'):
        ln = ln[:-1]
    fields = ln.split('\t')
    if len(fields) != 3:
        raise ValueError('expected 3 tab-separated fields, got %d: %r'
                         % (len(fields), ln))
    # The middle field (plain tokens) is not needed: the tokens are recovered
    # from the POS-tagged field.
    itmid, _toks, postagged = fields
    words, tags = _split_tagged(postagged)
    tkns = _index_duplicates(words, tags)
    return '\n'.join([
        '---',
        '% ID ' + itmid,
        '% POS TEXT',
        postagged,
        '',
        '% TEXT',
        ' '.join(tkns),
        '\n% ANNO\n',
        _anno_text(tkns, tags),
        '',
        '',
    ])


def main(path=DEFAULT_INPUT):
    '''
    Convert every item in the file at path and print the result to stdout.

    @param path: corpus file to read; completely blank lines are skipped
    '''
    inF = fileinput.input(path)
    try:
        for ln in inF:
            if not ln.rstrip('\r\n'):
                continue  # a blank line carries no item
            print(format_item(ln))
    finally:
        inF.close()


if __name__ == '__main__':
    # An optional first command-line argument overrides the default file.
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_INPUT)