-
Notifications
You must be signed in to change notification settings - Fork 0
/
pos_bigram.py
executable file
·118 lines (95 loc) · 3.34 KB
/
pos_bigram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#from __future__ import division
import nltk
import pickle
# nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
from numpy.random import choice
import numpy as np
# Running tally of disfluency bigrams -> occurrence count; filled in (and
# accumulated across calls) by createListOfBigrams().
bigram_p = {}
# Marker prepended to every corpus sentence before POS tagging.
START_SYM = "<s>"
# Fraction of the input sentence's word count used as the number of bigrams
# to sample in the __main__ script.
PERCENTAGE = 0.10
# Value createDist() returns when asked for an unknown distribution name.
DTYPE_ERROR = "Dtype does not exist."
# TODO: add a right skewed prob dist
def createDist(possible, dtype="uniform"):
    """Build a list of sampling probabilities, one per entry of *possible*.

    Args:
        possible: Sequence of candidates.  For ``"right_skewed"`` every entry
            must be indexable and carry its numeric weight at index 1.
        dtype: Name of the distribution: ``"uniform"`` gives every entry the
            same probability; ``"right_skewed"`` gives each entry a
            probability proportional to its weight.

    Returns:
        A list of floats with the same length as *possible* (empty when
        *possible* is empty).  For any other *dtype* the module-level
        DTYPE_ERROR value is returned instead of a list; no exception is
        raised.
    """
    size = len(possible)
    if dtype == "uniform":
        return [1.0 / size for _ in range(size)]
    elif dtype == "right_skewed":
        total = sum(entry[1] for entry in possible)
        if total == 0:
            # All weights are zero (or there are no entries): dividing would
            # fail, so treat every entry as equally likely.
            return [1.0 / size for _ in range(size)]
        return [entry[1] / total for entry in possible]
    else:
        return DTYPE_ERROR
def bigramSort(listOfBigrams):
    """Return a new list of the entries ordered by their second element, largest first.

    Entries with equal second elements keep their relative input order.
    """
    def weightOf(entry):
        return entry[1]

    ranked = list(listOfBigrams)
    ranked.sort(key=weightOf, reverse=True)
    return ranked
def createListOfBigrams(corpusPath="./data/annotated.txt"):
    """Count POS-tagged bigrams that contain a disfluency marker.

    Every line of the corpus file is split on whitespace, prefixed with
    START_SYM, tagged with nltk's ``pos_tag`` and turned into bigrams of
    (word, tag) pairs.  Bigrams touching the start symbol are skipped; the
    remaining ones are tallied only when either word is ``"(pause)"``,
    ``"(uh)"`` or ``"(um)"``.

    The tally lives in the module-level ``bigram_p`` dict, so calling this
    function more than once in the same process adds to the earlier counts.

    Args:
        corpusPath: Path of the corpus file, one sentence per line.

    Returns:
        A list of (bigram, count) pairs in the order produced by bigramSort.
    """
    disfluencies = ("(pause)", "(uh)", "(um)")

    # The context manager closes the corpus file even if tagging fails.
    with open(corpusPath, "r") as corpusFile:
        for sentence in corpusFile:
            taggedTokens = pos_tag([START_SYM] + sentence.split())
            for bigram in nltk.bigrams(taggedTokens):
                firstWord = bigram[0][0]
                secondWord = bigram[1][0]
                if firstWord == START_SYM or secondWord == START_SYM:
                    continue
                if firstWord in disfluencies or secondWord in disfluencies:
                    bigram_p[bigram] = bigram_p.get(bigram, 0) + 1

    return bigramSort(list(bigram_p.items()))
def searchDraw(word, draw):
    """Report whether *word* is one of the two words of any drawn bigram.

    Args:
        word: The word to look for.
        draw: Iterable of entries whose first element is a bigram, i.e. a
            pair of items that each carry their word at index 0.

    Returns:
        1 if some entry's bigram has *word* as its first or second word,
        otherwise 0 (also for an empty *draw*).
    """
    found = any(
        entry[0][0][0] == word or entry[0][1][0] == word
        for entry in draw
    )
    return 1 if found else 0
def returnDraw(word, draw):
    """Return the first drawn bigram that has *word* as one of its two words.

    Args:
        word: The word to look for.
        draw: Iterable of entries whose first element is a bigram, i.e. a
            pair of items that each carry their word at index 0.

    Returns:
        The bigram (the entry's first element) of the first matching entry,
        or None when no entry matches.
    """
    return next(
        (
            entry[0]
            for entry in draw
            if entry[0][0][0] == word or entry[0][1][0] == word
        ),
        None,
    )
def cleanInput(sent):
    """Lower-case *sent* and strip every period, comma and double quote from it."""
    cleaned = sent.lower()
    for mark in (".", ",", "\""):
        cleaned = cleaned.replace(mark, "")
    return cleaned
def getPOS(sentence, listOfBigrams):
    """Collect the bigram entries that share a tagged word with *sentence*.

    The sentence is lower-cased, split on whitespace and tagged with
    ``pos_tag``.  An entry of *listOfBigrams* is selected once for every
    tagged token that equals either member of the entry's bigram (its first
    element), so an entry can appear more than once in the result.

    Returns:
        The selected entries in the order produced by bigramSort.
    """
    taggedWords = pos_tag(sentence.lower().split())
    matches = [
        entry
        for tagged in taggedWords
        for entry in listOfBigrams
        if tagged == entry[0][0] or tagged == entry[0][1]
    ]
    return bigramSort(matches)
if __name__ == "__main__":
    # Interactive demo: read a sentence, sample cached bigrams that share a
    # tagged word with it, and print the sentence with each sampled bigram's
    # two words substituted for the word it matched.
    inputSentence = cleanInput(input("Input Sentence: "))

    # To rebuild the cached bigram table from the corpus, run once:
    # bigrams = createListOfBigrams()
    # outfile = open('./obj/pos_bigram', 'wb')
    # pickle.dump(bigrams, outfile)
    # outfile.close()

    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only load a cache that this project wrote itself.
    with open('./obj/pos_bigram', 'rb') as infile:
        bigrams = pickle.load(infile)

    candidates = getPOS(inputSentence, bigrams)

    # Fill an (N, 2) object array cell by cell: np.array() on these ragged
    # (bigram, count) tuples raises ValueError on recent NumPy releases.
    choices = np.empty((len(candidates), 2), dtype=object)
    for row, (bigram, count) in enumerate(candidates):
        choices[row, 0] = bigram
        choices[row, 1] = count
    print(choices)

    # Number of bigrams to sample: PERCENTAGE of the word count, rounded down.
    numDraws = int(PERCENTAGE*(inputSentence.count(" ")+1))
    if len(candidates) == 0:
        # Nothing matched the sentence; choice() cannot sample from an empty
        # population, so the draw is simply empty.
        draw = choices
    else:
        draw = choices[choice(choices.shape[0], numDraws, p=createDist(choices, dtype="right_skewed"))]
    print(draw)

    for word in inputSentence.split():
        if searchDraw(word, draw) == 1:
            tup = returnDraw(word, draw)
            print(tup[0][0]+" ", tup[1][0]+" ", end="")
        else:
            print(word + " ", end="")
    print()