-
Notifications
You must be signed in to change notification settings - Fork 3
/
convert-flat-parses-to-pos-tags.py
38 lines (33 loc) · 1.2 KB
/
convert-flat-parses-to-pos-tags.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re
import time
import io
import sys
import nltk
import argparse
# Every accepted input line must start with this wrapper.
LINE_PREFIX = u'(TOP (S ('
# Number of trailing characters dropped from every accepted line.
# NOTE(review): presumably the closing brackets of the last pair, of S and of
# TOP plus the line terminator (e.g. ") ) )\n") -- confirm against real input.
LINE_SUFFIX_LENGTH = 6
# Text between two consecutive "token tag" pairs once the wrapper is removed.
PAIR_SEPARATOR = u') ('


def build_arg_parser():
    """Return the command-line parser for the converter.

    All three paths are required: without them the script used to fail with
    an opaque TypeError from io.open(None, ...) instead of a usage message.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-i", required=True,
        help="input flat parses (depth = 1). one sentence per line.")
    arg_parser.add_argument(
        "-ot", required=True,
        help="output tokens. one sentence per line.")
    arg_parser.add_argument(
        "-op", required=True,
        help="output part-of-speech tags. one sentence per line.")
    return arg_parser


def split_flat_parse(line):
    """Split one flat parse line into parallel token and tag lists.

    Args:
        line: one line of the input file, including its line terminator.

    Returns:
        A ``(tokens, tags)`` tuple of lists, or ``None`` when the line does
        not start with ``LINE_PREFIX``.

    Raises:
        ValueError: if a pair does not consist of exactly two fields
            separated by a single space.
    """
    if not line.startswith(LINE_PREFIX):
        return None
    body = line[len(LINE_PREFIX):-LINE_SUFFIX_LENGTH]
    tokens, tags = [], []
    for pair in body.split(PAIR_SEPARATOR):
        # The first field is written to the tokens file, the second to the
        # tags file.
        token, tag = pair.split(u' ')
        tokens.append(token)
        tags.append(tag)
    return tokens, tags


def main(argv=None):
    """Convert the flat parses in ``-i`` into token (``-ot``) and tag (``-op``) files.

    Lines that do not start with ``LINE_PREFIX`` are reported on stdout and
    skipped; every other line produces one line in each output file.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = build_arg_parser().parse_args(argv)
    # Context managers close all three files even when a malformed pair
    # raises in the middle of the run.
    with io.open(args.i, encoding='utf8', mode='r') as input_file, \
            io.open(args.ot, encoding='utf8', mode='w') as tokens_file, \
            io.open(args.op, encoding='utf8', mode='w') as tags_file:
        for line_number, line in enumerate(input_file, 1):
            parsed = split_flat_parse(line)
            if parsed is None:
                # Single-argument print(...) is valid on Python 2 and 3 and
                # reproduces the spacing of the original print statement.
                print(u'WARNING: skipping line # {}  which does not start with the prefix "(TOP (S (":'.format(line_number))
                print(line)
                continue
            tokens, tags = parsed
            tokens_file.write(u'{}\n'.format(u' '.join(tokens)))
            tags_file.write(u'{}\n'.format(u' '.join(tags)))


if __name__ == "__main__":
    main()