biglistgrab.py
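# biglistgrab.py
#
# Grabs profile details for a set of Twitter users (either the members of a
# Twitter list or a space-separated set of screen names), writes them to a CSV
# under reports/, then grabs and samples the followers of each of those users
# and writes one CSV of follower profiles per user.
#
# Written for Python 2 and an older tweepy API (print statements, unicode(),
# api.followers_ids / lookup_users(screen_names=...)); it also relies on a
# local helper module, newt, for authentication and utility functions.
# networkx, newtx and urllib are imported but not used in this script.
#
# Example invocations (assumed from the argparse setup below):
#   python biglistgrab.py -list someuser/somelist -fname myrun
#   python biglistgrab.py -users userA userB -sample 0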
import tweepy, sys, os, newt, argparse, datetime, csv, random, math
import networkx as nx
import newtx as nwx
import urllib, unicodedata
def checkDir(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
parser = argparse.ArgumentParser(description='Data about list members')
group = parser.add_mutually_exclusive_group()
group.add_argument('-list', help='Grab users from a list. Provide source as: username/listname')
group.add_argument('-users', nargs='*', help="A space separated list of usernames (without the @) for whom you want to do the grab.")
parser.add_argument('-sample', default=197, type=int, metavar='N', help='Sample the followers grabbed for each user; use 0 if you want them all.')
parser.add_argument('-fname', default='', help='Custom folder name for the reports/ output directory')
ORDEREDSAMPLE=1
args=parser.parse_args()
api=newt.getTwitterAPI()
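# Assumption: newt is a local helper module. From its use below, getTwitterAPI()
# returns an authenticated tweepy API handle and chunks(seq, n) yields successive
# n-item slices of a list. ORDEREDSAMPLE=1 selects the windowed "ordered" follower
# sampling further down; any other value falls back to random.sample().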
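# Resolve 'username/listname' into the screen names of the list's members.
# Assumption: newt.listDetailsByScreenName() returns a dict of user objects,
# wrapping tweepy's api.list_members cursor.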
def getUsersFromList(userList):
    userList_l = userList.split('/')
    user = userList_l[0]
    listName = userList_l[1]
    tmp = newt.listDetailsByScreenName({}, api.list_members, user, listName)
    u = []
    for i in tmp:
        u.append(tmp[i].screen_name)
    return u
sampleSize = args.sample
# -fname defaults to '' (not None), so test for a non-empty string here
if args.fname != '':
    fpath = str(args.fname) + '/'
else:
    fpath = ''
now = datetime.datetime.now()
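# Write one CSV row of profile fields per user object in the global twd list.
# Note: this relies on the globals fd (output directory), fn (output file name)
# and source (the list or user the rows were grabbed for) being set by the caller.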
def outputter():
    checkDir(fd)
    print 'Writing file...', fn
    writer = csv.writer(open(fn, 'wb+'), quoting=csv.QUOTE_ALL)
    writer.writerow(['source', 'screen_name', 'name', 'description', 'location', 'time_zone', 'created_at', 'contributors_enabled', 'url', 'listed_count', 'friends_count', 'followers_count', 'statuses_count', 'favourites_count', 'id_str', 'id', 'verified', 'utc_offset', 'profile_image_url', 'protected'])
    twDetails = {}
    for u in twd:
        twDetails[u.screen_name] = u
        ux = [source]
        # Text fields: normalise to ASCII so the Python 2 csv writer doesn't choke
        for x in [u.screen_name, u.name, u.description, u.location, u.time_zone]:
            if x != None:
                ux.append(unicodedata.normalize('NFKD', unicode(x)).encode('ascii', 'ignore'))
            else:
                ux.append('')
        # Numeric / boolean / date fields can be written as-is
        for x in [u.created_at, u.contributors_enabled, u.url, u.listed_count, u.friends_count, u.followers_count, u.statuses_count, u.favourites_count, u.id_str, u.id, u.verified, u.utc_offset, u.profile_image_url, u.protected]:
            ux.append(x)
        try:
            writer.writerow(ux)
        except:
            pass
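# Build the seed user set: either the members of the -list argument or the
# screen names passed with -users, looked up in batches of 100 (the cap on
# a single lookup_users call).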
twd = []
twn = []
if args.list != None:
    source = args.list.replace('/', '_')
    users = getUsersFromList(args.list)
    fd = 'reports/' + fpath + args.list.replace('/', '_') + '/'
    fn = fd + 'listTest_' + now.strftime("_%Y-%m-%d-%H-%M-%S") + '.csv'
    print fn
    for l in newt.chunks(users, 100):
        #print 'partial',l
        tmp = api.lookup_users(screen_names=l)
        for u in tmp:
            twd.append(u)
            twn.append(u.screen_name)
    outputter()
elif args.users != None:
    for l in newt.chunks(args.users, 100):
        #print 'partial',l
        tmp = api.lookup_users(screen_names=l)
        for u in tmp:
            twd.append(u)
            twn.append(u.screen_name)
else:
    sys.exit(-1)
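# For each seed user, pull their follower IDs, optionally sample them, then
# look the sampled IDs up in batches of 100 and write a per-user CSV.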
for user in twn:
    currSampleSize = sampleSize
    source = user
    twd = []
    fd = 'reports/' + fpath  # +user+'/'
    fn = fd + user + '_fo_' + str(sampleSize) + '_' + now.strftime("_%Y-%m-%d-%H-%M-%S") + '.csv'
    print 'grabbing follower IDs for', user
    try:
        mi = tweepy.Cursor(api.followers_ids, id=user).items()
    except:
        continue
    users = []
    try:
        for m in mi:
            users.append(m)
    except:
        continue
    biglen = str(len(users))
    print 'Number of followers:', biglen
    #HACK
    if len(users) > 10000:
        currSampleSize = 10000
    #this breaks the date recreation on followers - need a run of 10000 users
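    # Sampling: with ORDEREDSAMPLE set, take 100 windows of 100 consecutive
    # follower IDs spread across the full follower list (followers_ids returns
    # most-recent-first, so consecutive runs keep some follow-order information).
    # Note the windowed sampling assumes at least ~10,000 followers; with fewer,
    # offset-100 goes negative and random.randint() will raise a ValueError.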
    if currSampleSize > 0:
        if len(users) > currSampleSize:
            if ORDEREDSAMPLE != 1:
                users = random.sample(users, currSampleSize)
                print 'Using a random sample of ' + str(currSampleSize) + ' from ' + str(biglen)
            else:
                #tmpsamp=int(len(users)/currSampleSize)
                #need some way of getting 100 consecutive samples of 100 or so users?
                print 'Using ordered sample of ' + str(currSampleSize) + ' from ' + str(biglen)
                ss = []
                offset = int(math.floor(len(users) / 100))
                for i in range(100):
                    randoff = random.randint(0, offset - 100)
                    li = int(randoff + i * offset)
                    ss = ss + users[li:li + 100]
                users = ss
        else:
            print 'Fewer members (' + str(len(users)) + ') than sample size: ' + str(currSampleSize)
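    # Hydrate the (possibly sampled) follower IDs into full user objects,
    # 100 at a time, then write this user's CSV.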
    n = 1
    print 'Hundred batching'
    for l in newt.chunks(users, 100):
        #print 'partial',l
        print str(n)
        n = n + 1
        try:
            tmp = api.lookup_users(user_ids=l)
            for u in tmp:
                twd.append(u)
        except:
            continue
    print '...done'
    outputter()