-
Notifications
You must be signed in to change notification settings - Fork 4
/
do_label.py
95 lines (69 loc) · 3.44 KB
/
do_label.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import twitter, os, time, sys, re, shutil
# Our shared modules
import common, definitions, filters
def update_class_file():
"""We store list of labelled tweets in *.cls files in the format
class label | message
e.g.
n | If I see one more back to school commercial I'm giving my eyes a paper cut.
y | i got lemon on my finger and it stings .-. stupid paper cut -.-
(Only the tweet text is stored here. The other tweet infornation
is not stored as we used only the tweet text for classification)
common.LATEST_CLASS_FILE (see common.py for actual name) is the
file where we keep our most up-to-list of labelled tweets
This function updates common.LATEST_CLASS_FILE using common.TWEETS_FILE,
which contains all the tweets that have undergone simple screening and
we have saved.
It also guesses labels for each new tweet using the latest classification
model.
(We track the tweet id of the latest tweet in common.LATEST_CLASS_FILE)
"""
# The lastest labelled tweet id (an integer) is stored as text in LATEST_CLASS_FILE
latest_labelled_tweet_id = int(file(common.LATEST_CLASS_FILE, 'rt').read().strip()) if os.path.exists(common.LATEST_CLASS_FILE) else 0
previous_tweet_id = latest_labelled_tweet_id
# Read the classification model. This will be used to guess tweet classifications
model = common.load_model()
# Read the tweets from TWEETS_FILE, label them and store them
# in labelled_messages
labelled_messages = []
fp = open(common.TWEETS_FILE, 'rt')
for line in fp:
line = line.strip(' \n')
# Skip empty lines
if not line:
continue
# Skip incorrectly formatted lines
try:
id_s,_,_,message = [pt.strip() for pt in line.split('|')]
id = int(id_s)
except ValueError:
print 'ValueError', line
continue
# Skip tweets we have already saved
if id <= latest_labelled_tweet_id:
continue
# Filter out messages that are not even allowed for training
if not filters.is_allowed_for_training(message):
continue
cls,_ = model.classify(message)
labelled_messages.append([definitions.AUTO_CLASSES_LABELS[cls], message])
latest_labelled_tweet_id = max(id, latest_labelled_tweet_id)
fp.close()
print 'found %d new tweets' % len(labelled_messages)
print 'before: latest_labelled_tweet_id=%d' % previous_tweet_id
print 'after: latest_labelled_tweet_id=%d' % latest_labelled_tweet_id
if latest_labelled_tweet_id == previous_tweet_id:
print 'Nothing to do'
exit()
# Save the current labelled data file
shutil.copyfile(common.CLASS_FILE, '%s.%d' % (common.CLASS_FILE, previous_tweet_id))
# Add the new entries to the labelled data file
fp = open(common.CLASS_FILE, 'at')
for i,t in enumerate(labelled_messages):
fp.write('%s | %s\n' % (t[0], t[1]))
fp.close()
print 'Added %d new labelled messages to %s' % (len(labelled_messages), common.CLASS_FILE)
# Update the latest labelled entry id
file(common.LATEST_CLASS_FILE, 'wt').write(str(latest_labelled_tweet_id))
if __name__ == '__main__':
update_class_file()