-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathn-fold.py
161 lines (127 loc) · 5.45 KB
/
n-fold.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from probCalc import probability as PB
from tagger import Tagger
import sys
import os.path
from random import random, seed
def corpusReader(corpus):
    """Parse a tagged corpus file into three parallel lists of sentences.

    The file is read one token per line: the first whitespace-separated
    field is taken as the word and the second as its tag (any further
    fields are ignored).  Blank or whitespace-only lines separate
    sentences.  Runs of separator lines do not create empty sentences, and
    a final sentence is kept even if the file does not end with a blank
    line.

    Args:
        corpus: Path of the corpus file to read.

    Returns:
        A tuple ``(sentences, correctTags, tagData)``.  For sentence ``i``,
        ``sentences[i]`` is its list of words, ``correctTags[i]`` its list
        of tags and ``tagData[i]`` its list of ``(word, tag)`` tuples.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    sentences = []    # All the sentences (words only)
    correctTags = []  # Correct tags, parallel to ``sentences``
    tagData = []      # Sentences as lists of (word, tag) pairs

    def storeSentence(pairs):
        # Record one finished sentence in all three output lists at once so
        # they can never get out of step with each other.
        sentences.append([word for word, _ in pairs])
        correctTags.append([tag for _, tag in pairs])
        tagData.append(pairs)

    # (word, tag) pairs of the sentence currently being read
    current = []
    with open(corpus, 'r') as corp:
        for line in corp:
            fields = line.split()
            if not fields:
                # Separator line: close the running sentence, if there is one.
                if current:
                    storeSentence(current)
                    current = []
                continue
            current.append((fields[0], fields[1]))

    # The file may end without a trailing blank line; keep the last sentence.
    if current:
        storeSentence(current)

    # Report how many sentences were read.
    print(len(sentences))
    return sentences, correctTags, tagData
# Breaks a list into n parts and returns a list of the parts
def dividList(alist, n, startSeed):
    """Randomly deal the items of ``alist`` into ``n`` parts of near-equal size.

    The module-level random generator is re-seeded with ``startSeed`` on
    every call, so the same seed applied to inputs of the same length always
    picks the same positions.  Items are dealt round-robin, so the first
    ``len(alist) % n`` parts hold one item more than the rest.

    Args:
        alist: Iterable of items to divide; it is copied, never modified.
        n: Number of parts to create; must be at least 1.
        startSeed: Seed passed to ``random.seed`` before drawing.

    Returns:
        A list of ``n`` lists that together contain every item of ``alist``
        exactly once.

    Raises:
        ValueError: If ``n`` is smaller than 1 (there would be no part to
            put items into, and the dealing loop could never finish).
    """
    if n < 1:
        raise ValueError("n must be at least 1, got {}".format(n))

    seed(startSeed)
    # One output list per part
    data = [[] for _ in range(n)]
    # Work on a copy so the caller's data is left untouched
    inData = list(alist)

    # Deal randomly chosen items to the parts in turn until none are left.
    target = 0
    while inData:
        picked = inData.pop(int(random() * len(inData)))
        data[target].append(picked)
        target = (target + 1) % n
    return data
def _percentage(correct, total):
    """Return ``correct / total`` as a percentage rounded to two decimals.

    Returns 0.0 when ``total`` is 0 so that an empty fold cannot trigger a
    division by zero.
    """
    if total == 0:
        return 0.0
    return round(correct / total * 100, 2)


def evaluate(n, corpus):
    """Run repeated n-fold cross-validation of the tagger on a corpus.

    The corpus is read once.  Then, for each seed ``1..n``, the sentences
    are split into ``n`` random folds; every fold is tagged in turn by a
    tagger trained on the other folds.  Accuracy is printed per fold, per
    seed, and over all seeds.

    Args:
        n: Number of folds (also used as the number of repetitions).  Values
            below 2 are replaced by 10, because a single fold would leave
            nothing to train on.
        corpus: Path of the tagged corpus file, as accepted by
            ``corpusReader``.

    Returns:
        None; all results are printed.
    """
    if n < 2:
        n = 10
        print("n was to low and has been set to 10\n")

    # Get all the data
    sentences, correctTags, tagData = corpusReader(corpus)

    # Token counts accumulated over every repetition
    allCorrect = 0
    allIncorrect = 0

    for check in range(1, n + 1):
        # Shuffle sentence *indices* once and use them for words, tags and
        # training pairs alike, so the three views always stay aligned.
        folds = dividList(list(range(len(tagData))), n, check)

        # Token counts for this repetition
        correctlyTagged = 0
        incorrectlyTagged = 0

        print("Check {} doing {}-fold on {}\n".format(check, n, corpus))

        # For each part to evaluate
        for i, testFold in enumerate(folds):
            # Train on every sentence that is not in the held-out fold.
            train = [tagData[j]
                     for k, fold in enumerate(folds) if k != i
                     for j in fold]

            # NOTE(review): PB is assumed to return the four model parts
            # that Tagger expects, in this order; confirm against probCalc.
            uni, bi, tri, word = PB(train)
            tagger = Tagger(uni, bi, tri, word)

            correctTagCount = 0
            incorrectTagCount = 0

            # Tag each held-out sentence and compare with the gold tags.
            for j in testFold:
                tagged = tagger.tagSentence(sentences[j])
                # zip stops at the shorter sequence, so a length mismatch
                # between gold and predicted tags cannot raise IndexError.
                for gold, predicted in zip(correctTags[j], tagged):
                    if gold == predicted:
                        correctTagCount += 1
                    else:
                        incorrectTagCount += 1

            # Per-fold progress report
            print("{}-fold was tagged {}% correctly.".format(
                i + 1,
                _percentage(correctTagCount,
                            correctTagCount + incorrectTagCount)))

            correctlyTagged += correctTagCount
            incorrectlyTagged += incorrectTagCount

        allCorrect += correctlyTagged
        allIncorrect += incorrectlyTagged

        # Totals for this repetition, in numbers..
        checkTotal = correctlyTagged + incorrectlyTagged
        print("\n{} out of {} was correctly tagged.".format(
            correctlyTagged, checkTotal))
        # .. and percentage
        print("\nFor a total of {}% correctness.".format(
            _percentage(correctlyTagged, checkTotal)))

    # Totals over all repetitions, in numbers..
    grandTotal = allCorrect + allIncorrect
    print("\n{} out of {} was correctly tagged.".format(
        allCorrect, grandTotal))
    # .. and percentage
    print("\nFor a total of {}% correctness.".format(
        _percentage(allCorrect, grandTotal)))
if __name__ == '__main__':
    # Run script with args: corpus path and, optionally, the amount of folds.
    if len(sys.argv) < 2:
        # Without a corpus there is nothing to do; stop here instead of
        # passing None on to os.path.isfile, which would raise TypeError.
        print("No corpus to evaluate")
        sys.exit(1)
    corpus = sys.argv[1]

    try:  # n-fold
        n = int(sys.argv[2])
    except (IndexError, ValueError):
        # Fold count missing or not an integer: fall back to 10 folds.
        n = 10

    if os.path.isfile(corpus):
        evaluate(n, corpus)
    else:
        print("{} not found.".format(corpus))