#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Download tweets matching a query, one file per day, for chosen months.

Usage:
Odd months of 2009 for Obama in storage folder /nobackup/user
./download.py 2009 1,3,5,7,9,11 Obama /nobackup/user

Even months:
./download.py 2009 2,4,6,8,10,12 Obama /nobackup/user
"""

import os
import sys
import time
import codecs
import logging
import traceback

from datetime import date, timedelta
from functools import partial
from calendar import monthrange
from itertools import islice

# got3 is looked up two directories above the current working directory.
sys.path.append("../..")
try:
    import got3 as got
except ImportError:
    # Keep the module importable (usage text, date helpers) without got3;
    # _require_got() turns this into a clear error before any download work.
    got = None


# A day's file counts as complete once it holds more than this many lines.
retries_count_check = 3000  # TODO: KEEP this at 3000
# Seconds to pause after each day has been processed.
sleep_time = 300  # TODO: KEEP this at 300
# Maximum download attempts per day.
num_retries = 100  # TODO: KEEP this at 100


def _require_got():
    """Raise ImportError with a helpful message if got3 failed to import."""
    if got is None:
        raise ImportError(
            "got3 could not be imported; it is searched for in '../..' "
            "relative to the current working directory"
        )


def _has_more_lines_than(filename, threshold):
    """Return True if *filename* contains more than *threshold* lines.

    Reads at most ``threshold + 1`` lines, so large files are not scanned
    to the end.
    """
    with open(filename, 'rb') as f:
        return sum(1 for _ in islice(f, threshold + 1)) > threshold


def day_window(year, month, day):
    """Return ``(since, until)`` ISO dates covering exactly one day.

    Args:
        year, month, day: Calendar components; each may be an int or a
            numeric string.

    Returns:
        A tuple of ``YYYY-MM-DD`` strings where ``until`` is the calendar
        day after ``since``. Real date arithmetic is used so the last day
        of a month rolls over into the next month (or year).

    Raises:
        ValueError: If the components do not form a valid date.
    """
    start = date(int(year), int(month), int(day))
    return start.isoformat(), (start + timedelta(days=1)).isoformat()


def get_tweets(criteria, filename):
    """Fetch tweets for *criteria* and stream them into *filename*.

    The file is (re)created as UTF-8 and every batch delivered by
    ``got.manager.TweetManager.getTweets`` is appended via ``write_tweets``.
    Failures while opening the file or downloading are logged, not raised:
    the caller judges success by inspecting the file afterwards.

    Raises:
        ImportError: If got3 is not available.
    """
    _require_got()
    try:
        # The context manager closes the file on every path; if the open
        # itself fails, that error is what gets logged.
        with codecs.open(filename, "w+", "utf-8") as out:
            got.manager.TweetManager.getTweets(criteria, partial(write_tweets, out))
    except Exception:
        # Deliberately broad: this is a best-effort attempt inside a retry loop.
        logging.error(traceback.format_exc())


def get_tweets_with_retry(query, since, until, filename):
    """Download tweets for one date window, retrying until enough arrive.

    Each attempt starts from an empty *filename*. An attempt succeeds when
    the file ends up with more than ``retries_count_check`` lines; at most
    ``num_retries`` attempts are made, and the file keeps whatever the last
    attempt produced.

    Args:
        query: Search text assigned to ``criteria.querySearch``.
        since: Start date string assigned to ``criteria.since``.
        until: End date string assigned to ``criteria.until``.
        filename: Path of the output file.

    Returns:
        True if an attempt reached the line threshold, otherwise False.

    Raises:
        ImportError: If got3 is not available.
    """
    _require_got()
    criteria = got.manager.TweetCriteria()
    criteria.querySearch = query
    criteria.since = since
    criteria.until = until
    # Debug aid: uncomment to cap the download size.
    # criteria.maxTweets = 2

    for remaining in range(num_retries, 0, -1):
        print('Retries number: {}'.format(remaining))
        # Make sure an empty file exists even if the download never opens it,
        # so the line count below always has something to read.
        with open(filename, "w"):
            pass
        get_tweets(criteria, filename)
        if _has_more_lines_than(filename, retries_count_check):
            return True
    return False


def write_tweets(file, tweets):
    """Write one batch of tweets to *file*, flushing after every tweet.

    Args:
        file: Writable text object providing ``write`` and ``flush``.
        tweets: Sized iterable of objects exposing a ``text`` attribute.

    Each tweet text is preceded by a newline, so the output begins with an
    empty line.
    """
    for t in tweets:
        file.write('\n{}'.format(t.text))
        file.flush()
    print('Saving batch of {} tweets ...\n'.format(len(tweets)))


def download_tweets_for_range(args):
    """Download every day of the requested months into per-month folders.

    Args:
        args: Sequence ``[year, months, query, base_dirname]`` of strings,
            where ``months`` is a comma-separated list such as ``"1,3,5"``.

    For each month a folder ``<base_dirname>/<year>_<MM>_<query>`` is
    created if missing, and each day is stored in it as
    ``<YYYY-MM-DD>_<query>.csv``. The function sleeps ``sleep_time``
    seconds after every day.

    Raises:
        ImportError: If got3 is not available.
        IndexError: If fewer than four arguments are supplied.
        ValueError: If the year or a month is not a valid number.
    """
    # Fail before touching the filesystem when the downloader is missing.
    _require_got()

    year = args[0]
    months = args[1].split(',')
    query = args[2]
    base_dirname = args[3]

    for month in months:
        month = month.strip().zfill(2)
        # Validate the month before creating any directory for it.
        last_day = monthrange(int(year), int(month))[1]

        dirname = os.path.join(base_dirname, year + '_' + month + '_' + query)
        os.makedirs(dirname, exist_ok=True)

        for day in range(1, last_day + 1):
            since, until = day_window(year, month, day)
            filename = os.path.join(dirname, since + '_' + query + '.csv')
            print('Working on file - {} \n'.format(filename))
            get_tweets_with_retry(query, since, until, filename)
            time.sleep(sleep_time)


if __name__ == '__main__':
    if len(sys.argv) < 5:
        # Show the usage text instead of failing with a bare IndexError.
        sys.exit(__doc__)
    download_tweets_for_range(sys.argv[1:])