-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
157 lines (148 loc) · 6.01 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import os
import pickle
import re
from nltk import sent_tokenize, word_tokenize
from document_class import Document
from data import *
def extract_metadata(string):
    '''
    Returns the content of a Helsinki corpus metadata tag.

    Any ">" characters on either end of the tag are stripped, the rest is
    split on whitespace, the first token (the tag opener, e.g. "<N") is
    dropped and the remaining tokens are joined with single spaces.
    '''
    tokens = string.strip(">").split()
    value_tokens = tokens[1:]
    return " ".join(value_tokens)
def char_replace(text):
    '''
    Replace special character encodings with the actual characters.

    Each "+X" code in the text is replaced by the character it stands for.
    Returns the text with all known codes substituted; text without any
    codes is returned unchanged.
    '''
    # Order matters: the two-letter crossed-thorn codes ("+TT", "+Tt", "+tt")
    # must be handled before the one-letter thorn codes ("+T", "+t"),
    # otherwise the shorter code would consume the prefix of the longer one.
    replacements = (
        ("+A", "Æ"),   # capital ash
        ("+a", "æ"),   # lowercase ash
        ("+D", "Ð"),   # capital eth (was mistakenly mapped to Y-acute "Ý")
        ("+d", "ð"),   # lowercase eth
        ("+G", "Ȝ"),   # capital yogh
        ("+g", "ȝ"),   # lowercase yogh
        ("+TT", "Ꝥ"),  # capital crossed thorn
        ("+Tt", "Ꝥ"),  # capital crossed thorn
        ("+tt", "ꝥ"),  # lowercase crossed thorn
        ("+T", "Þ"),   # capital thorn
        ("+t", "þ"),   # lowercase thorn
        ("+L", "£"),   # pound sign
        ("+e", "ę"),   # lowercase e caudata
    )
    for code, char in replacements:
        text = text.replace(code, char)
    return text
# Compiled patterns for the corpus markup. Every pattern except `newline`
# is compiled with re.DOTALL, so a marked span may run across line breaks.

# line continuation: "#" directly before a newline
newline = re.compile(r"#\n")
# (^ ... ^)  font other than the basic font
font = re.compile(r"\(\^(.*?)\^\)", flags=re.DOTALL)
# (\ ... \)  foreign language
foreign = re.compile(r"\(\\(.*?)\\\)", flags=re.DOTALL)
# (} ... })  runes
runes = re.compile(r"\(}(.*?)}\)", flags=re.DOTALL)
# [{ ... {]  emendations
emendation = re.compile(r"\[{(.*?){\]", flags=re.DOTALL)
# [\ ... \]  editor's comment
editor = re.compile(r"\[\\.*?\\\]", flags=re.DOTALL)
# [} ... }]  heading
heading = re.compile(r"\[}(.*?)}\]", flags=re.DOTALL)
# [^ ... ^]  helsinki corpus comments
helsinki = re.compile(r"\[\^.*?\^\]", flags=re.DOTALL)
# [}   }]    heading containing only whitespace (or nothing)
empty_heading = re.compile(r"\[}\s*?}\]", flags=re.DOTALL)
def clean_text(text):
    '''
    Removes Helsinki corpus markup from the document text.

    Returns a tuple (text, no_foreign):
      text       -- cleaned text; foreign-language and rune passages are
                    kept, only their markers are removed
      no_foreign -- cleaned text with foreign-language and rune passages
                    removed entirely
    Both variants are passed through char_replace before being returned.
    '''
    # clean newline markers
    text = newline.sub("\n", text)
    # clean emendation markers (keep the emended text)
    text = emendation.sub(r"\g<1>", text)
    # clean editor and corpus comments
    text = editor.sub("", text)
    text = helsinki.sub("", text)
    # clean font markers (keep the marked text)
    text = font.sub(r"\g<1>", text)
    # Remove headings that are empty at this point. This has to happen
    # before the heading markers are unwrapped: once `heading` has run there
    # is no heading markup left for `empty_heading` to match.
    text = empty_heading.sub("", text)
    # clean remaining heading markers (keep the heading text)
    text = heading.sub(r"\g<1>", text)
    # Remove foreign text and runes for the no_foreign variety. The second
    # substitution must continue from no_foreign, not from text, or the
    # foreign-text removal is thrown away.
    no_foreign = foreign.sub("", text)
    no_foreign = runes.sub("", no_foreign)
    # clean foreign text and rune markers (keep the marked text)
    text = foreign.sub(r"\g<1>", text)
    text = runes.sub(r"\g<1>", text)
    # run both varieties through char_replace
    text = char_replace(text)
    no_foreign = char_replace(no_foreign)
    return text, no_foreign
train_docs = []
devtest_docs = []
test_docs = []

# Metadata tag prefix -> name of the Document attribute that stores its value.
# "<N " (document name) is handled separately because it can also mark the
# start of a new document within the same file.
_METADATA_ATTRS = {
    "<C ": "corpus_section",
    "<O ": "o_date",
    "<M ": "m_date",
    "<D ": "dialect",
    "<V ": "style",
    "<G ": "translate_relation",
    "<F ": "original_language",
}


def _finalize_document(doc, raw_text):
    '''
    Stores the raw and cleaned text on doc and files it under its data split.

    The cleaned texts have every run of newlines collapsed to one space.
    The document goes to train_docs if doc.filename is in TRAIN, to
    devtest_docs if it is in DEVTEST, and to test_docs otherwise.
    '''
    # save raw text
    doc.raw_text = raw_text
    # clean text
    cleaned, cleaned_foreign = clean_text(raw_text)
    # replace newlines with a space
    cleaned = re.sub(r"\n\n*", " ", cleaned)
    cleaned_foreign = re.sub(r"\n\n*", " ", cleaned_foreign)
    # save the cleaned forms
    doc.cleaned_text = cleaned
    doc.cleaned_text_no_foreign = cleaned_foreign
    # save char lists
    doc.chars = list(cleaned)
    doc.chars_no_foreign = list(cleaned_foreign)
    # add doc to list
    if doc.filename in TRAIN:
        train_docs.append(doc)
    elif doc.filename in DEVTEST:
        devtest_docs.append(doc)
    else:
        test_docs.append(doc)


for file in os.listdir(CORPUS_DIR):
    # only files whose name starts with "c" are processed
    if not file.startswith("c"):
        continue
    # read the whole file and close the handle right away
    with open(os.path.join(CORPUS_DIR, file)) as corpus_file:
        full_text = corpus_file.read()
    doc = Document(file)
    raw_text = ""
    for line in full_text.split("\n"):
        if line.startswith("<N "):
            name = extract_metadata(line)
            # if we're on a new document in the same file
            # NOTE(review): reads doc.name before this loop has set it when
            # body text precedes the first "<N " tag -- presumably Document
            # provides a default; confirm.
            if raw_text.strip() and name != doc.name:
                _finalize_document(doc, raw_text)
                # create new doc and reset raw text
                doc = Document(file)
                raw_text = ""
            doc.name = name
            continue
        attr = _METADATA_ATTRS.get(line[:3])
        if attr is not None:
            setattr(doc, attr, extract_metadata(line))
        elif not line.startswith("<"):
            # any other "<...>" tag line is skipped; everything else is text
            raw_text += line + "\n"
    # save final part of the document
    _finalize_document(doc, raw_text)

os.makedirs("cached_data", exist_ok=True)
for cache_path, docs in (
    ("cached_data/train_docs.pkl", train_docs),
    ("cached_data/devtest_docs.pkl", devtest_docs),
    ("cached_data/test_docs.pkl", test_docs),
):
    with open(cache_path, 'wb') as f:
        pickle.dump(docs, f)
print("Done!")