-
Notifications
You must be signed in to change notification settings - Fork 3
/
xml_preprocess.py
35 lines (25 loc) · 991 Bytes
/
xml_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from __future__ import with_statement
from lxml import etree
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
# Command-line arguments: the input XML file name (looked up under data/)
# and an output file name.
fn = sys.argv[1]
# NOTE: `output` is only consumed by the disabled pipeline below; it is still
# read here so the script keeps requiring both arguments.
output = sys.argv[2]

# Disabled earlier pipeline, kept for reference: it tokenized the first
# sentence of every <seg> text node and wrote the tokens to data_cleaned/.
# node=etree.parse('data/'+fn)
# sentences = node.xpath("//mteval//doc//seg/text()")
# cleaned = []
# for s in sentences:
#     cleaned.append([word_tokenize(s.replace("'", " ").replace('"',"")) for s in sent_tokenize(s.strip())][0])
# with open('data_cleaned/'+output, 'w') as f:
#     for _list in cleaned:
#         f.write(' '.join(_list).encode('utf-8') + '\n')

# Only text nodes strictly longer than this many characters are considered.
MIN_TEXT_LENGTH = 2000

node = etree.parse('data/' + fn)
sentences = node.xpath("//doc//text()")
sentences = [s for s in sentences if len(s) > MIN_TEXT_LENGTH]

# Without this guard, indexing sentences[0] below fails with a bare
# IndexError whenever the document has no sufficiently long text node.
if not sentences:
    sys.exit("no text node under //doc longer than %d characters in data/%s"
             % (MIN_TEXT_LENGTH, fn))

# Walk the first long text node line by line, echoing each raw line and then
# its tokenization.
for line in sentences[0].splitlines():
    print(line)
    # word_tokenize is applied to each whitespace-separated chunk, so the
    # result is a list of token lists (one per chunk), not a flat list.
    cleaned_line = [word_tokenize(s) for s in line.split()]
    # Blank / whitespace-only lines produce an empty list; skip printing it.
    if cleaned_line:
        print(cleaned_line)
    # print([word_tokenize(s) for s in sent_tokenize(line)])
#print([word_tokenize(s) for s in sent_tokenize('Thank you very much, I appreciate it.')])
#print(sentences[0])