-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocess.py
132 lines (110 loc) · 5.37 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import sys
import os
import re
import operator
from stemmer import *
# Uniquename: mamolina
# Dictionary of all possible expansions: maps a lowercase contraction to the
# list of words it is replaced with during tokenization.
# "won't" is the contraction of "will not" (it previously expanded to "would not").
expands = {"i'll": ['i', 'will'], "'twas": ['it', 'was'], "she'll": ['she', 'will'], "why'd": ['why', 'did'], "don't": ['do', 'not'], "should've": ['should', 'have'], "didn't": ['did', 'not'], "they've": ['they', 'have'], "who'll": ['who', 'will'], "won't": ['will', 'not'], "we'd": ['we', 'would'], "couldn't": ['could', 'not'], "how'll": ['how', 'will'], "why's": ['why', 'is'], "you'd": ['you', 'would'], "doesn't": ['does', 'not'], "might've": ['might', 'have'], "how's": ['how', 'is'], "he's": ['he', 'is'], "when's": ['when', 'is'], "where'd": ['where', 'did'], "what'd": ['what', 'did'], "he'd": ['he', 'would'], "can't": ['can', 'not'], "how'd": ['how', 'did'], "there's": ['there', 'is'], "shouldn't": ['should', 'not'], "they'll": ['they', 'will'], "when'll": ['when', 'will'], "where'll": ['where', 'will'], "you're": ['you', 'are'], "we're": ['we', 'are'], "mightn't": ['might', 'not'], "i've": ['i', 'have'], "'tis": ['it', 'is'], "what's": ['what', 'is'], "who's": ['who', 'is'], "where's": ['where', 'is'], "they'd": ['they', 'would'], "ain't": ['is', 'not'], "you've": ['you', 'have'], "would've": ['would', 'have'], "that'll": ['that', 'will'], "aren't": ['are', 'not'], "who'd": ['who', 'would'], "he'll": ['he', 'will'], "must've": ['must', 'have'], "they're": ['they', 'are'], "we'll": ['we', 'will'], "why'll": ['why', 'will'], "weren't": ['were', 'not'], "wasn't": ['was', 'not'], "wouldn't": ['would', 'not'], "hasn't": ['has', 'not'], "she'd": ['she', 'would'], "you'll": ['you', 'will'], "i'd": ['i', 'would'], "could've": ['could', 'have'], "she's": ['she', 'is'], "i'm": ['i', 'am'], "when'd": ['when', 'did'], "mustn't": ['must', 'not'], "isn't": ['is', 'not'], "that's": ['that', 'is']}
# List of stopwords (lowercase); tokens equal to one of these are discarded.
stopWords = ['a', 'all', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'few', 'from', 'for', 'have', 'he', 'her', 'here', 'him', 'his', 'how', 'i', 'in', 'is', 'it', 'its', 'many', 'me', 'my', 'none', 'of', 'on', 'or', 'our', 'she', 'some', 'the', 'their', 'them', 'there', 'they', 'that', 'this', 'to', 'us', 'was', 'what', 'when', 'where', 'which', 'who', 'why', 'will', 'with', 'you', 'your']
# Month names and abbreviations (lowercase) that the tokenizer looks for so it
# can join a month with the day number that follows it.
months = ['january', 'jan.', 'feb.', 'february', 'march', 'mar.', 'april', 'apr.', 'may', 'june', 'july', 'august', 'aug.', 'sept.', 'september', 'oct.', 'october', 'nov.', 'november', 'dec.', 'december']
# Tokens that carry no content on their own and are filtered out of the
# tokenizer's result (includes the empty string).
special_chars = ["", "+", "-", "=", "&", "%", ",", ".", "'"]
# Removes any SGML tags present
def removeSGML(text):
    """Return ``text`` with SGML tags removed.

    A tag is ``<`` or ``</`` immediately followed by a letter, up to the next
    ``>``. The whole tag is dropped, including its name and attributes, so
    that tag names do not end up in the text as if they were words. Any
    ``</``, ``<`` or ``>`` left over afterwards (delimiters that were not part
    of such a tag) is stripped as well.
    """
    # Remove complete tags first; stripping only the brackets would leave the
    # tag names behind in the text.
    text = re.sub(r'</?[A-Za-z][^<>]*>', '', text)
    # Strip stray delimiters. '</' goes first so its slash is removed too.
    for leftover in ('</', '<', '>'):
        text = text.replace(leftover, '')
    return text
# Returns true if the token has a number
def has_number(token):
    """Report whether at least one character of ``token`` is a digit."""
    for ch in token:
        if ch.isdigit():
            return True
    # No digit found (this is also the result for an empty token).
    return False
# Returns true if the token is a number
def is_number(token):
    """Report whether ``token`` is non-empty and made up only of digits.

    An empty token is not a number; ``str.isdigit`` already returns False for
    it, whereas ``all()`` over an empty sequence is True.
    """
    return token.isdigit()
# Removes parenthesis and slashes from the token
def remove_parenthesis_slash(token):
    """Return ``token`` with every ``(``, ``)`` and ``/`` character removed.

    The slash removal used to be assigned to a misspelled variable and was
    therefore discarded; all three characters are now actually stripped.
    """
    for unwanted in ("(", ")", "/"):
        token = token.replace(unwanted, "")
    return token
# Tokenizes the contents of the document
def tokenizeText(text):
    """Split ``text`` on whitespace and normalize each piece into tokens.

    Rules, applied to each whitespace-separated item in this order:

    * a contraction listed in ``expands`` is replaced by its expansion;
    * any other item containing an apostrophe is split in two at the first
      apostrophe (the apostrophe stays with the second part);
    * an item containing commas but no digits is split on the commas, and a
      piece whose only period is a trailing one loses that period;
    * a trailing ``,``, ``?``, ``!`` or a trailing period that is the item's
      only period is removed;
    * a month name from ``months`` is held back and joined with the item
      right after it when that item is all digits (``"may 5"``);
    * anything else is kept as is.

    Tokens listed in ``special_chars`` are removed from the result.

    Returns a list of token strings.
    """
    tokens = []
    pending_month = None
    for item in text.split():
        # A month is only held back for the item that immediately follows it.
        # If that item is not a day number, emit the month as an ordinary
        # token instead of losing it or attaching it to a later number.
        if pending_month is not None and not is_number(item):
            tokens.append(pending_month)
            pending_month = None

        if item in expands:
            tokens.extend(expands[item])
        elif "'" in item:
            h_pos = item.find("'")
            tokens.append(item[:h_pos])
            tokens.append(item[h_pos:])
        elif "," in item and not has_number(item):
            for piece in item.split(","):
                # Work on the piece itself; appending the whole comma-joined
                # item here produced duplicated tokens that still had commas.
                if len(piece) > 1 and piece.endswith(".") and piece.count(".") == 1:
                    tokens.append(piece[:-1])
                elif piece != "." and piece != "":
                    tokens.append(piece)
        elif item[-1] in ",?!" or (item[-1] == "." and item.count(".") == 1):
            # NOTE: abbreviated months such as 'jan.' are caught by this
            # branch (trailing period stripped) before the month check below.
            tokens.append(item[:-1])
        elif item in months:
            pending_month = item
        elif pending_month is not None and is_number(item):
            tokens.append(pending_month + ' ' + item)
            pending_month = None
        elif item != "":
            tokens.append(item)

    # A month at the very end of the text has nothing to be joined with.
    if pending_month is not None:
        tokens.append(pending_month)

    # Build a real list so the result is the same type on Python 2 and 3.
    return [token for token in tokens if token not in special_chars]
def removeStopwords(tokens):
    """Return a new list holding the tokens that are not in ``stopWords``.

    The order of the remaining tokens is unchanged.
    """
    return [token for token in tokens if token not in stopWords]
def stemWords(tokens):
    """Return a list with each purely alphabetic token replaced by its stem.

    Tokens containing any non-alphabetic character are copied through
    unchanged. Stemming is delegated to ``PorterStemmer.stem``, which is
    given the token plus the indices of its first and last character.
    """
    result = []
    for word in tokens:
        if not all(ch.isalpha() for ch in word):
            # Numbers, dates, hyphenated words, etc. are not stemmed.
            result.append(word)
            continue
        stemmer = PorterStemmer()
        last_index = len(word) - 1
        result.append(stemmer.stem(word, 0, last_index))
    return result
if __name__ == '__main__':
    # Script entry point: preprocess every file in the directory given as the
    # first command-line argument, then print the total number of tokens, the
    # vocabulary size and the 50 most frequent tokens with their counts.
    if len(sys.argv) < 2:
        sys.exit('Usage: python ' + sys.argv[0] + ' <directory>')
    dirname = sys.argv[1]
    words = {}
    total = 0
    for filename in os.listdir(dirname):
        # os.path.join works whether or not dirname ends with a separator.
        path = os.path.join(dirname, filename)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as documents; skip them.
            continue
        # The with-block guarantees the file handle is closed.
        with open(path, 'r') as f:
            content = f.read().replace('\n', ' ')
        content = content.lower()
        content = removeSGML(content)
        content = remove_parenthesis_slash(content)
        tokens = tokenizeText(content)
        tokens = removeStopwords(tokens)
        stemmed_tokens = stemWords(tokens)
        for token in stemmed_tokens:
            words[token] = words.get(token, 0) + 1
            total += 1
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('Words ' + str(total))
    print('Vocabulary ' + str(len(words)))
    sorted_words = sorted(words.items(), key=operator.itemgetter(1), reverse=True)
    # Report only the 50 most frequent tokens.
    for word, count in sorted_words[:50]:
        print(word + ' ' + str(count))