-
Notifications
You must be signed in to change notification settings - Fork 0
/
get-comments-chains-from-csv.py
123 lines (98 loc) · 3.51 KB
/
get-comments-chains-from-csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import csv
import datetime
import json
from collections import deque

import requests
def getThreadComments(subreddit, threadname, results):
    """Fetch one Reddit thread and add its reply chains to ``results``.

    Downloads ``https://www.reddit.com/r/<subreddit>/comments/<threadname>/.json``
    and walks the comment tree breadth-first. For every comment that is a
    reply to another comment, ``results[parent_author][reply_author]`` is
    incremented by one. Top-level comments have no parent and add no edge.

    Args:
        subreddit: Subreddit name inserted into the URL.
        threadname: Thread id inserted after ``/comments/`` in the URL.
        results: Dict of dicts ``{parent_author: {reply_author: count}}``.
            Updated in place, so the same dict can accumulate edges over
            several calls.

    Returns:
        None; the outcome is the mutation of ``results``.

    Raises:
        Exceptions from ``requests`` (connection failure, timeout, HTTP
        error status) and lookup errors caused by an unexpected payload
        shape are not handled here and propagate to the caller.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    # call the main api
    baseurl = 'https://www.reddit.com/r/' + subreddit + '/comments/' + threadname + '/.json'
    print(baseurl)
    r = requests.get(
        baseurl,
        headers={'User-agent': 'chainBot 0.1'},
        # Without a timeout a stalled connection would block the crawl forever.
        timeout=30,
    )
    # Fail with a clear HTTP error instead of a confusing lookup error below.
    r.raise_for_status()
    data = r.json()

    # Marks queue entries for top-level comments. A dedicated object is used
    # so that no value coming from the payload can be mistaken for it.
    no_parent = object()

    # Explicit FIFO queue of (comment, parent author) pairs: the traversal
    # neither grows a list while iterating over it nor stores bookkeeping
    # keys inside the API payload. The second element of the response is
    # the listing whose children are the thread's top-level comments.
    pending = deque((reply, no_parent) for reply in data[1]['data']['children'])

    while pending:
        reply, prev_author = pending.popleft()

        # kind "t1" is a comment; anything else at this level is not parsed
        if reply['kind'] != 't1':
            # TODO do something with more
            print('more comments not parsed')
            continue

        # get the author
        author = reply['data']['author']
        if prev_author is not no_parent:
            # create source/target entries on demand, then increase the edge value
            targets = results.setdefault(prev_author, {})
            targets[author] = targets.get(author, 0) + 1

        # check if the comment has children ("" means there are none)
        nested = reply['data']['replies']
        if nested == "":
            continue

        for child in nested['data']['children']:
            # if the object has kind: "t1" it means that there are comments
            # if the object has kind: "more" it means that there are just codes
            kind = child['kind']
            if kind == "t1":
                pending.append((child, author))
                continue

            if kind == "more":
                print(' more comments')
                # still to understand how to get 'more' comments
                # see https://www.reddit.com/r/redditdev/comments/67mdxm/how_to_use_apimorechildren/
                # this is another example https://www.reddit.com/api/morechildren.json?api_type=json&link_id=t3_7fk5mw&children=dqcshyh
            else:
                print(' other kind')
            # The dump keeps showing a "prev_author" key next to the child's
            # own fields, but on a copy so the payload stays untouched.
            print(json.dumps(dict(child, prev_author=author), indent=4, sort_keys=True))
def getMoreComments(threadId, childId):
    """Request collapsed ("more") comments from Reddit's morechildren endpoint.

    Args:
        threadId: Value sent as the ``link_id`` query parameter
            (presumably the thread's fullname, e.g. ``t3_...`` - confirm
            against the Reddit API documentation).
        childId: Value sent as the ``children`` query parameter.

    Returns:
        The decoded JSON body of the response. Earlier versions discarded it
        and returned None, so callers that ignore the result are unaffected.

    Raises:
        Exceptions from ``requests`` (connection failure, timeout, HTTP
        error status) propagate to the caller.
    """
    baseurl = 'https://www.reddit.com/api/morechildren.json?api_type=json'
    params = {
        'link_id': threadId,
        'children': childId,
    }
    r = requests.get(
        baseurl,
        params=params,
        headers={'User-agent': 'chainBot 0.1'},
        # Without a timeout a stalled connection would block forever.
        timeout=30,
    )
    r.raise_for_status()
    # TODO: rebuild the tree out of the comments
    return r.json()
# Script body: read the thread ids, crawl every thread into one edge dict,
# then write the edge list and the list of failed threads as tab-separated
# files.

# load ids from csv
errors = []
with open('data/r-italy-posts-from-bigquery.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    # Skip the header row. The next() builtin with a default does not fail
    # on an empty file, unlike the Python-2-only reader.next() method.
    headers = next(readCSV, None)
    # the second column is what gets passed on as the thread id
    threads = [row[1] for row in readCSV]
print(json.dumps(threads, indent=4, sort_keys=True))

# edges[source][target] = number of replies, filled in by getThreadComments
edges = {}
for count, thread in enumerate(threads, 1):
    print('%s / %s' % (count, len(threads)))
    print(thread)
    try:
        getThreadComments('italy', thread, edges)
    except Exception as exc:
        # Best effort: one failing thread must not stop the crawl. Catching
        # Exception (not a bare except) keeps Ctrl-C working, and the reason
        # is recorded to fill the 'error' column of errors.csv.
        print('ERRORE')
        errors.append([thread, repr(exc)])

# NOTE: binary mode ("wb") is what the csv module expects on Python 2; on
# Python 3 these files would have to be opened with mode "w" and newline=''.

# save results in csv
with open('results.csv', "wb") as ofile:
    writer = csv.writer(ofile, delimiter='\t', quotechar='"')
    writer.writerow(['Source', 'Target', 'Weight'])
    for sourceName, source in edges.items():
        for targetName, value in source.items():
            print('adding  %s %s %s' % (sourceName, targetName, value))
            # only edges seen more than once are exported
            if value > 1:
                writer.writerow([sourceName, targetName, value])

# save errors in csv
with open('errors.csv', "wb") as ofile:
    writer = csv.writer(ofile, delimiter='\t', quotechar='"')
    writer.writerow(['thread', 'error'])
    # writerow was previously subscripted ("writerow[err]") instead of
    # called, which raised TypeError as soon as one thread had failed.
    writer.writerows(errors)