-
Notifications
You must be signed in to change notification settings - Fork 807
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding download.py with retries and checks and download data into files
- Loading branch information
1 parent
f927c27
commit 7c246ff
Showing
2 changed files
with
100 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
|
||
import os | ||
import sys | ||
import time | ||
import codecs | ||
import logging | ||
import traceback | ||
|
||
from functools import partial | ||
from calendar import monthrange | ||
|
||
sys.path.append("../..") | ||
import got3 as got | ||
|
||
|
||
''' | ||
Usage: | ||
Odd months of 2009 for Obama in storage folder /nobackup/user | ||
./download.py 2009 1,3,5,7,9,11 Obama /nobackup/user | ||
Even months: | ||
./download.py 2009 2,4,6,8,10,12 Obama /nobackup/user | ||
''' | ||
# Line-count threshold: get_tweets_with_retry treats a day's download as
# complete once the output file holds more lines than this.
retries_count_check = 3000 # TODO: KEEP this at 3000
# Value passed to time.sleep() in download_tweets_for_range (seconds).
sleep_time = 300 # TODO: KEEP this at 300
# Starting value of the retry countdown used by get_tweets_with_retry.
num_retries = 100 # TODO: KEEP this at 100
|
||
def get_tweets(criteria, filename):
    """Fetch the tweets matching ``criteria`` and write them to ``filename``.

    The file is opened as UTF-8 in ``"w+"`` mode (truncating any existing
    content) and bound to ``write_tweets``, which is handed to
    ``got.manager.TweetManager.getTweets`` as its second argument.

    Any exception, including a failure to open ``filename``, is logged with
    its traceback and not re-raised.

    Args:
        criteria: search criteria object forwarded to ``getTweets``.
        filename: path of the output file.
    """
    try:
        # The context manager closes the file on every path. The previous
        # ``finally: file.close()`` raised UnboundLocalError whenever the
        # open itself failed, hiding the real error.
        with codecs.open(filename, "w+", "utf-8") as out_file:
            got.manager.TweetManager.getTweets(
                criteria, partial(write_tweets, out_file))
    except Exception:
        logging.error(traceback.format_exc())
|
||
|
||
def get_tweets_with_retry(query, since, until, filename):
    """Download tweets for one date window, retrying until enough lines exist.

    Builds a ``TweetCriteria`` from ``query``, ``since`` and ``until``, then
    repeatedly empties ``filename``, calls ``get_tweets`` and inspects the
    result. The loop ends as soon as the file contains more than
    ``retries_count_check`` lines, or when the ``num_retries`` countdown
    reaches zero.
    """
    criteria = got.manager.TweetCriteria()
    criteria.querySearch = query
    criteria.since = since
    criteria.until = until
    # criteria.maxTweets = 2  # debugging aid, keep disabled

    attempts_left = num_retries
    while attempts_left:
        print('Retries number: {}'.format(attempts_left))
        # Every attempt starts from an empty file.
        with open(filename, "w"):
            pass
        get_tweets(criteria, filename)
        with open(filename, 'rb') as saved:
            # Stops reading as soon as the threshold is crossed.
            has_enough = any(
                line_no > retries_count_check
                for line_no, _ in enumerate(saved, start=1)
            )
        if has_enough:
            return
        attempts_left -= 1
|
||
|
||
def write_tweets(file, tweets):
    """Write the text of each tweet in ``tweets`` to ``file``.

    Every tweet is written as a newline followed by its ``text`` attribute,
    and the file is flushed after each write. A summary line with the batch
    size is printed once the whole batch has been written.
    """
    for tweet in tweets:
        entry = '\n{}'.format(tweet.text)
        file.write(entry)
        file.flush()
    batch_size = len(tweets)
    print('Saving batch of {} tweets ...\n'.format(batch_size))
|
||
|
||
def download_tweets_for_range(args):
    """Download one file of tweets per day for the requested months.

    For each month a directory ``<base_dirname>/<year>_<MM>_<query>`` is
    created if it does not exist, and each day of that month is saved to
    ``<YYYY-MM-DD>_<query>.csv`` inside it via ``get_tweets_with_retry``.

    Args:
        args: sequence of four strings ``[year, months, query, base_dirname]``
            where ``months`` is a comma-separated list of month numbers,
            for example ``"1,3,5"``.

    Raises:
        IndexError: if fewer than four items are supplied.
        ValueError: if the year or a month is not a valid number.
    """
    # Local import so this function does not depend on the file's import block.
    from datetime import date, timedelta

    year = args[0]
    months = args[1].split(',')
    query = args[2]
    base_dirname = args[3]

    for month in months:
        # Two-digit month; also tolerates stray spaces such as "1, 3".
        month = month.strip().zfill(2)
        dirname = os.path.join(
            base_dirname, '{}_{}_{}'.format(year, month, query))
        os.makedirs(dirname, exist_ok=True)

        days_in_month = monthrange(int(year), int(month))[1]
        for day_number in range(1, days_in_month + 1):
            day = date(int(year), int(month), day_number)
            since = day.isoformat()
            # Real date arithmetic: the old ``str(day + 1)`` produced
            # unpadded days ("2009-01-2") and impossible dates at month end
            # ("2009-01-32") instead of rolling over to the next month/year.
            until = (day + timedelta(days=1)).isoformat()
            filename = os.path.join(dirname, '{}_{}.csv'.format(since, query))
            print('Working on file - {} \n'.format(filename))
            get_tweets_with_retry(query, since, until, filename)
            # Pause before the next day's download.
            # NOTE(review): assumes the sleep belongs to the per-day loop
            # rather than the per-month loop -- confirm.
            time.sleep(sleep_time)
|
||
|
||
if __name__ == '__main__':
    # Four positional arguments are required (see the usage notes near the
    # top of the file); fail with a readable message instead of an IndexError.
    cli_args = sys.argv[1:]
    if len(cli_args) < 4:
        sys.exit('Usage: ./download.py YEAR MONTH[,MONTH...] QUERY STORAGE_DIR')
    download_tweets_for_range(cli_args)