Skip to content

Commit

Permalink
Adding download.py with retries and checks and download data into files
Browse files Browse the repository at this point in the history
  • Loading branch information
johnyrufus committed Oct 12, 2017
1 parent f927c27 commit 7c246ff
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 0 deletions.
Empty file.
100 changes: 100 additions & 0 deletions project/datadownload/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import time
import codecs
import logging
import traceback

from functools import partial
from calendar import monthrange

sys.path.append("../..")
import got3 as got


"""
Usage:
Odd months of 2009 for Obama in storage folder /nobackup/user
./download.py 2009 1,3,5,7,9,11 Obama /nobackup/user
Even months:
./download.py 2009 2,4,6,8,10,12 Obama /nobackup/user
"""
# Tuning knobs for the download loop. The original notes ask that the values
# below be kept at 3000 / 300 / 100.

# A day's output file must contain more than this many lines before the
# download for that day is considered complete.
retries_count_check = 3000

# Seconds passed to time.sleep() after each day's download.
sleep_time = 300

# Maximum number of download attempts made for a single day.
num_retries = 100

def get_tweets(criteria, filename):
    """Download the tweets matching ``criteria`` into ``filename``.

    The file is opened as UTF-8 in ``"w+"`` mode, so any previous content is
    truncated. Each batch delivered by ``got.manager.TweetManager.getTweets``
    is passed to ``write_tweets`` bound to the open file.

    Any exception, including a failure to open the file, is logged with its
    traceback and swallowed, so a failed download does not abort the caller.

    Args:
        criteria: Search criteria object handed through to ``getTweets``.
        filename: Path of the output file.
    """
    try:
        # ``with`` closes the handle on every path. The previous try/finally
        # called close() on an unbound name when open() itself failed, which
        # raised UnboundLocalError and hid the real error.
        with codecs.open(filename, "w+", "utf-8") as out:
            got.manager.TweetManager.getTweets(
                criteria, partial(write_tweets, out))
    except Exception:
        logging.error(traceback.format_exc())


def get_tweets_with_retry(query, since, until, filename):
    """Download tweets for one date window, retrying until enough arrive.

    Builds a ``got.manager.TweetCriteria`` from ``query``, ``since`` and
    ``until`` and calls ``get_tweets`` up to ``num_retries`` times. An attempt
    counts as successful once ``filename`` holds more than
    ``retries_count_check`` lines; the file is emptied before every attempt.

    Args:
        query: Search string assigned to ``criteria.querySearch``.
        since: Start date string assigned to ``criteria.since``.
        until: End date string assigned to ``criteria.until``.
        filename: Path of the file the tweets are written to.
    """
    criteria = got.manager.TweetCriteria()
    criteria.querySearch = query
    criteria.since = since
    criteria.until = until
    # For a quick local run, cap the download with: criteria.maxTweets = 2

    # Count down so the printed number is the attempts still available.
    for attempts_left in range(num_retries, 0, -1):
        print('Retries number: {}'.format(attempts_left))

        # Start every attempt from an empty file.
        with open(filename, "w"):
            pass
        get_tweets(criteria, filename)

        # Stop reading as soon as the line count passes the threshold.
        with open(filename, 'rb') as fh:
            has_enough_lines = any(
                line_no > retries_count_check
                for line_no, _ in enumerate(fh, start=1)
            )
        if has_enough_lines:
            return


def write_tweets(file, tweets):
    """Write the text of every tweet in ``tweets`` to ``file`` and flush.

    Each tweet's ``text`` is written preceded by a newline, so the output
    starts with a line break rather than ending with one. A progress message
    with the batch size is printed afterwards.

    Args:
        file: Writable text file object.
        tweets: Sized iterable of objects exposing a ``text`` attribute.
    """
    as_line = '\n{}'.format
    # Write tweet by tweet so that a failure part-way through still leaves
    # the earlier tweets of the batch in the file.
    for tweet in tweets:
        file.write(as_line(tweet.text))
    file.flush()

    batch_size = len(tweets)
    print('Saving batch of {} tweets ...\n'.format(batch_size))


def download_tweets_for_range(args):
    """Download one file of tweets per day for the requested months.

    For every month a directory ``<base_dirname>/<year>_<MM>_<query>`` is
    created if needed, and for every day of that month
    ``get_tweets_with_retry`` is called with a one-day window, writing to
    ``<YYYY-MM-DD>_<query>.csv`` inside that directory. The function sleeps
    ``sleep_time`` seconds after each day.

    Args:
        args: Sequence ``[year, months, query, base_dirname]`` of strings,
            where ``months`` is a comma-separated list such as ``"1,3,5"``.

    Raises:
        IndexError: If ``args`` has fewer than four items.
        ValueError: If the year or a month is not a valid number.
    """
    # Imported here because no other part of the script needs it.
    from datetime import date, timedelta

    year = args[0]
    months = args[1].split(',')
    query = args[2]
    base_dirname = args[3]

    for month in months:
        # Accept "1", "01" and " 1" alike; always use the two-digit form.
        month = month.strip().zfill(2)
        dirname = os.path.join(
            base_dirname, '{}_{}_{}'.format(year, month, query))
        os.makedirs(dirname, exist_ok=True)

        days_in_month = monthrange(int(year), int(month))[1]
        for day in range(1, days_in_month + 1):
            since_date = date(int(year), int(month), day)
            since = since_date.isoformat()
            # Real date arithmetic: the day after the last day of a month is
            # the first of the next month (not e.g. "2009-01-32"), and the
            # result is always zero-padded.
            until = (since_date + timedelta(days=1)).isoformat()

            filename = os.path.join(dirname, '{}_{}.csv'.format(since, query))
            print('Working on file - {} \n'.format(filename))
            get_tweets_with_retry(query, since, until, filename)
            time.sleep(sleep_time)


if __name__ == '__main__':
    # Expected arguments: year, comma-separated months, query, storage folder.
    cli_args = sys.argv[1:]
    if len(cli_args) < 4:
        # Exit with a usage hint instead of an IndexError from the downloader.
        sys.exit(
            'Usage: {} <year> <months,comma,separated> <query> <storage_dir>'
            .format(sys.argv[0]))
    download_tweets_for_range(cli_args)

0 comments on commit 7c246ff

Please sign in to comment.