cussacTwitterCrawlTimed.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
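"""Timed Twitter crawler: repeatedly queries the Twitter Search API around
New York City, appends the raw JSON of every tweet to a text file, buffers
selected fields in memory, and periodically flushes them to CSV. Requests
are throttled to stay under Twitter's search rate limit."""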
from TwitterSearch import *
from datetime import datetime, date
import pandas as pd
import time
import os
import yaml
from geoLocator import GeoLocator  # local helper exposing isNYC(lat, lon)


def saveRecordsToCSV(records, outputFileIndex):
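    """Append the buffered tweet tuples to csv_tweets<index>.csv and return
    a fresh, empty buffer. The header row is written only when the file is
    new (i.e. still empty)."""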
    # Create a data frame from tweets saved in records
    twitterFrame = pd.DataFrame.from_records(
        records,
        columns=['Tweet ID', 'Tweeted_At', 'Profile_Create_at', 'Username',
                 'Location', 'Enabled', 'Place', 'Geo', 'Text',
                 'Retweeted_Count'])
    # Append tweets to the CSV file; 'a' mode creates it if absent
    fileName = 'csv_tweets' + str(outputFileIndex) + '.csv'
    with open(fileName, 'a') as f:
        # Write the header only if the file is empty (i.e. newly created)
        header = os.path.getsize(fileName) == 0
        twitterFrame.to_csv(f, header=header)
    return []


def queryTwitter(records, outputFileIndex, totalRunTime, writeToFileTime, sleepTime):
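    """Poll the Twitter Search API until totalRunTime seconds have elapsed,
    buffering tweets in records and flushing them to CSV every
    writeToFileTime seconds. Pages backwards through results via max_id,
    sleeps sleepTime seconds between requests, and returns the in-memory
    buffer when the time budget expires."""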
    n = GeoLocator()
    req = 0            # number of requests issued so far
    next_max_id = 0    # pagination cursor; 0 means "start from the newest tweets"
    startTime = time.time()
    lastWriteTime = startTime
    tso = None
    ts = None
    text_file = None   # raw-JSON log; None until the first request opens it
    while time.time() - startTime < totalRunTime:
        try:
            now = time.time()
            print 'Total running time: ' + str(now - startTime) + ' seconds'
            # Check whether it is time to flush the buffer to CSV
            if now - lastWriteTime > writeToFileTime:
                print 'Writing to CSV ' + str(len(records)) + ' Tweets'
                records = saveRecordsToCSV(records, outputFileIndex)
                lastWriteTime = now
            # Create a new Twitter search object if the last one was discarded
            if tso is None:
                tso = TwitterSearchOrder()
                tso.setKeywords([''])  # empty keyword; the geocode below constrains results
                # tso.setLanguage('en')
                tso.setCount(100)      # maximum tweets per request
                tso.setIncludeEntities(False)
                tso.setGeocode(40.69, -73.94, 1, km=False)  # 1-mile radius in NYC
                # tso.setUntil(date(2014, 3, 24))
                ts = TwitterSearch(consumer_key='FqjFRT1OHl6xyIGoq9uXSA',
                                   consumer_secret='KuhoVREmf7ngwjOse2JOLJOVXNCi2IVEzQZu2B8',
                                   access_token='114454541-xcjy2sbl7Rr4oIaogsaBrlVL5H4CvcdvOSMy3MnR',
                                   access_token_secret='yyBBOJhxgfw9pezZda2hWF94doONSd50y0JoylYjL3rmY',
                                   verify=False)
            # Query the Twitter API; raw JSON goes to a per-index text file
            text_file = open('json_tweets' + str(outputFileIndex) + '.txt', 'a')
            req += 1
            print 'Request # ' + str(req)
            response = ts.searchTweets(tso)
            # Process every tweet (status) in the response
            for tweet in response['content']['statuses']:
                text_file.write(str(tweet))
                text_file.write('\n')
                tweet_id = tweet['id']
                tup = (tweet_id,
                       str(tweet['created_at']),
                       str(tweet['user']['created_at']),
                       str(tweet['user']['screen_name']),
                       str(tweet['user']['location'].encode('ascii', 'ignore')),
                       str(tweet['user']['geo_enabled']),
                       str(tweet['place']),
                       str(tweet['geo']),
                       str(tweet['text'].encode('ascii', 'ignore')),
                       str(tweet['retweet_count']))
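                # Keep the tweet if it either has no usable geotag or its
                # coordinates fall inside NYC. In Python 2, str(tweet['geo'])
                # renders the dict repr, e.g.:
                #   {u'type': u'Point', u'coordinates': [40.71, -74.0]}
                # (Twitter's 'geo' field is [latitude, longitude]); yaml.load
                # parses that repr, which is why the key below carries the
                # literal "u'..." prefix.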
                # Save only tweets with geo inside NYC, or without geo at all
                try:
                    geoObj = yaml.load(tup[7])
                    lat = geoObj["u'coordinates'"][0]
                    lon = geoObj["u'coordinates'"][1]
                    if n.isNYC(lat, lon):
                        records.append(tup)
                except Exception:
                    # No geotag (tup[7] is 'None') or unparseable repr:
                    # keep the tweet anyway
                    records.append(tup)
                # Track the lowest ID seen in this page (0 means first page)
                if tweet_id < next_max_id or next_max_id == 0:
                    next_max_id = tweet_id
                    next_max_id -= 1  # decrement to avoid seeing this tweet again
            # Done with this page: close the raw-JSON log and point the next
            # request at everything older than the lowest ID just seen
            text_file.close()
            tso.setMaxID(next_max_id)
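            # Example: if the oldest tweet in this page had id 123456, the
            # next request is sent with max_id = 123455, so successive pages
            # walk strictly backwards through the timeline without repeats.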
            print 'Number of Tweets in memory: ' + str(len(records))
            # Throttle to respect Twitter's search limit of 180 requests per
            # 15 minutes: 900 s / 180 = 5 s per request, so sleepTime (4 s by
            # default) plus per-request overhead stays just under the ceiling.
            print 'Sleeping...'
            time.sleep(sleepTime)
        except TwitterSearchException as e:
            print e
            # A failed request is typically a rate-limit error. Reset the
            # pagination cursor if nothing is buffered, close and roll over
            # the output files to a fresh timestamp, log the error, and wait
            # out the 15-minute rate-limit window before retrying.
            if len(records) == 0:
                next_max_id = 0
            if text_file is not None and not text_file.closed:
                text_file.close()
            outputFileIndex = datetime.strftime(datetime.utcnow(), "%b_%d_%Y_%H_%M")
            text_fileE = open('error_log.txt', 'a')
            text_fileE.write(str(e))
            text_fileE.write('\n')
            text_fileE.close()
            time.sleep(900)
            # Discard tso so a fresh search object is built on the next pass
            tso = None
    # Time budget exhausted: hand the remaining buffer back to the caller
    return records


def runCollectTweets(outputFileIndex, totalRunTime, writeToFileTime=300, sleepTime=4):
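    """Run the crawler for totalRunTime seconds, then flush any remaining
    buffered tweets to CSV. Note that with totalRunTime=float('Inf') (as in
    main) queryTwitter never returns and the final flush is never reached."""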
    records = []
    print 'Started querying...'
    records = queryTwitter(records, outputFileIndex, totalRunTime, writeToFileTime, sleepTime)
    print 'Last save to CSV'
    records = saveRecordsToCSV(records, outputFileIndex)
    print 'Number of Tweets in memory: ' + str(len(records))
    print 'Done!'


def main():
    # Index output files by the UTC start time, e.g. 'Mar_24_2014_15_30'
    i = datetime.strftime(datetime.utcnow(), "%b_%d_%Y_%H_%M")
    runCollectTweets(i, totalRunTime=float('Inf'), writeToFileTime=900)


if __name__ == '__main__':
    main()