ClassifierGen.py
'''
Created on 3 Jul 2014
@author: daniyar
'''
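# Periodically polls MongoDB for labelled tweets, groups them by classification,
# and (re)trains an NLTK Naive Bayes classifier per classification once enough
# new labelled examples have accumulated, pickling each trained model to
# "<classification name>.pickle".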
import pickle
from pymongo import MongoClient
from collections import defaultdict
import re
from nltk.tokenize.regexp import wordpunct_tokenize
from nltk.classify.naivebayes import NaiveBayesClassifier
from time import sleep
from nltk.corpus import stopwords
import gc
# Module-level handles populated by connect(); global_count tracks, per
# classification id, the last training "bucket" (records_count / thr) so that
# retraining only happens once enough new labelled tweets have arrived.
db = {}
tweets_collection = {}
classifications_collection = {}
global_count = {}
def connect(db_name, tweets_collection_name, classification_collection_name):
    global db, classifications_collection, tweets_collection
    client = MongoClient('localhost', 27017)
    db = client[db_name]
    tweets_collection = db[tweets_collection_name]
    classifications_collection = db[classification_collection_name]
def start():
    global classifications_collection, tweets_collection, global_count
    sw = stopwords.words('english')
    thr = 5  # retrain after every `thr` new labelled tweets per classification
    # Group labelled tweets by classification id:
    # {clasfId: [{'text': ..., 'classId': ...}, ...]}
    refactored_tweets = {}
    records = tweets_collection.find()
    for record in records:
        tweet = record['text']
        tmp_classifiers = record['classifications']
        for clasfId, classId in tmp_classifiers.iteritems():
            if clasfId not in refactored_tweets:
                refactored_tweets[clasfId] = []
            refactored_tweets[clasfId].append({'text': tweet, 'classId': classId})
    records = None
    gc.collect()
    for classification in classifications_collection.find():
        tweets = []
        classification_name = classification['classification']
        classification_id = str(classification["_id"])
        classes = classification['classes']
        #records = tweets_collection.find({"clasfId":classification_id})
        records = []
        try:
            records = refactored_tweets[classification_id]
        except KeyError:
            print "No tweets for classification ", classification_name
            continue
        records_count = len(records)
        print classification_name, records_count
        if classification_id in global_count:
            # Known classification: retrain only once another batch of `thr`
            # labelled tweets has accumulated since the last training run.
            if int(records_count / thr) > global_count[classification_id]:
                print "Exceeded threshold. Training started"
                for record in records:
                    tweet = record['text']
                    class_id = record['classId']
                    class_label = get_class_label(class_id, classes)
                    feats = features_from_tweet(tweet, class_label, word_indicator, stopwords=sw)
                    tweets.append(feats)
                classifier = NaiveBayesClassifier.train(tweets)
                f = open("%s.pickle" % classification_name, 'wb')
                pickle.dump(classifier, f)
                f.close()
                global_count[classification_id] = int(records_count / thr)
        else:
            # Classification seen for the first time: train immediately if at
            # least `thr` labelled tweets are available.
            global_count[classification_id] = int(records_count / thr)
            if global_count[classification_id] >= 1:
                print "New classification or just started monitor"
                for record in records:
                    tweet = record['text']
                    class_id = record['classId']
                    class_label = get_class_label(class_id, classes)
                    feats = features_from_tweet(tweet, class_label, word_indicator, stopwords=sw)
                    tweets.append(feats)
                classifier = NaiveBayesClassifier.train(tweets)
                f = open("%s.pickle" % classification_name, 'wb')
                pickle.dump(classifier, f)
                f.close()
def get_class_label(_id, classes):
    for _class in classes:
        if str(_class['_id']) == _id:
            return _class['name']
    return None
def preprocess_tweet(_tweet):
    # Strip @mentions, http:// URLs and quote/hash punctuation before tokenising.
    #tweet = re.sub(r'(@[a-zA-Z0-9]+)|(http://[a-zA-Z0-9]*(.com|.ru|.org|.uk|.us|.net|.ly)+[/a-zA-Z0-9]*)', '', _tweet)
    tweet = re.sub(r'(@[a-zA-Z0-9]+)|(http://[a-zA-Z0-9]*[.][a-zA-Z]+[/a-zA-Z0-9]*)|([".#]+)', '', _tweet)
    return tweet
def word_indicator(tweet, **kwargs):
    # Binary bag-of-words features: each remaining token maps to True.
    features = defaultdict(list)
    tweet_words = get_tweet_words(tweet, **kwargs)
    for w in tweet_words:
        features[w] = True
    return features
def get_tweet_words(_tweet, stopwords=[]):
    tweet = preprocess_tweet(_tweet)
    user_set = set(["http", "://"])
    tweet_words = set(wordpunct_tokenize(tweet.lower()))
    tweet_words = tweet_words.difference(stopwords)
    tweet_words = tweet_words.difference(user_set)
    tweet_words = [w for w in tweet_words if len(w) > 2]
    return tweet_words
def features_from_tweet(tweet, label, extractor, **kwargs):
    features = extractor(tweet, **kwargs)
    return (features, label)
if __name__ == "__main__":
    connect("classification", "tweets", "classifications")
    # Poll MongoDB every 10 seconds and retrain classifiers as labelled tweets arrive.
    while True:
        start()
        sleep(10)
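# Usage sketch: a consumer of the pickled models could reload one and classify
# a new tweet with the same feature extractor. The file name "sentiment.pickle"
# below is a hypothetical classification name; adjust it to whatever
# classifications exist in the database.
#
#   f = open("sentiment.pickle", 'rb')
#   clf = pickle.load(f)
#   f.close()
#   feats = word_indicator("Loving the new release!",
#                          stopwords=stopwords.words('english'))
#   print clf.classify(feats)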