-
Notifications
You must be signed in to change notification settings - Fork 20
/
PostParser.py
311 lines (278 loc) · 10.6 KB
/
PostParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
__author__ = 'rast'
import logging
from os import path, makedirs
import json
#from ThreadedDownload import ThreadedDownload # buggy like hell
from Download import download
from Api import call_api
from collections import defaultdict
import re
def make_dir(base_dir, name):
    """Ensure the directory `name` exists inside `base_dir` and return its path.

    :param base_dir: an already existing directory
    :param name: relative name of the directory to create inside `base_dir`
    :return: ``path.join(base_dir, name)``
    :raises RuntimeError: if `base_dir` does not exist or is not a directory
    :raises OSError: if the directory cannot be created (for example when
        a regular file with the same name is in the way)
    """
    import errno  # local import so the module's import block stays untouched

    if not path.isdir(base_dir):
        raise RuntimeError("Directory does not exist: {}".format(base_dir))

    directory = path.join(base_dir, name)
    try:
        makedirs(directory)
    except OSError as err:
        # An already existing directory is fine. Trying first and inspecting
        # the error (instead of checking and then creating) also covers the
        # case where the directory appears between the check and the create.
        if err.errno != errno.EEXIST or not path.isdir(directory):
            raise
    return directory
def escape(name):
    """Return `name` converted into a string usable as a file name.

    Every character that is not a word character, whitespace or one of
    ``+ = - ( ) $ ! # % & , .`` is replaced with an underscore. Leading and
    trailing whitespace is stripped and the result is cut to 250 characters.

    :param name: the proposed file name
    :return: the sanitized text (``unicode`` on Python 2, ``str`` on Python 3)
    """
    try:
        text_type = unicode  # noqa: F821 - Python 2 text type
    except NameError:
        text_type = str  # Python 3: str is already the text type

    # Raw string: "\-", "\w" and "\s" are regex escapes, not string escapes.
    cleaned = re.sub(r'[^+=\-()$!#%&,.\w\s]', '_', name, flags=re.UNICODE)
    return text_type(cleaned.strip())[:250]
class PostParser(object):
    """Parses given post into data lists (text, music, photos, info, etc.)

    An instance is bound to one output directory. Calling the instance with
    a raw post creates a sub-directory named after the post id, stores the raw
    JSON there and hands every non-ignored key of the post to the method with
    the same name (``text``, ``attachments``, ``comments``).

    parse post - store useful data:
        id (of the post)
        to_id (always user?)
        from_id (post author)
        date (unix timestamp, convert to time)
        text (unicode)
        attachments: (multimedia!)
            type (type name)
            <type>:
                ...
        comments: (obvious)
            count
            can_post (0|1)
        likes: (people list)
            count
            user_likes (if user liked it)
            can_like
            can_publish
        reposts: (people list)
            count
            user_reposted (0|1)
        signer_id (if group, and if post is signed)
        copy_owner_id (if repost, author's id)
        copy_post_id (if repost, original post id)
        copy_text (if repost, user's response)
    """

    # Keys of a raw post/comment that are skipped without a warning.
    _IGNORED_KEYS = frozenset([
        'id', 'to_id', 'from_id', 'date',
        'likes', 'reposts', 'signer_id',
        'copy_owner_id', 'copy_post_id', 'copy_post_date',
        'copy_post_type', 'reply_count', 'post_type',
        'post_source', 'online', 'attachment', 'copy_text',
        'media', 'can_edit',
        # comments fix
        'uid', 'cid', 'reply_to_cid', 'reply_to_uid',
        'reply_owner_id', 'reply_post_id',
    ])

    # Photo size keys in the order they are tried by dl_photo.
    _PHOTO_SIZES = ('src_xxxbig', 'src_xxbig', 'src_xbig',
                    'src_big', 'src', 'src_small')

    def __init__(self, base_dir, subdir, args):
        """Make directory for current user

        :param base_dir: existing directory to work in
        :param subdir: name of the directory created inside `base_dir`
        :param args: options object; this class reads ``args.no_download``
            and ``args.id`` and passes the object on to ``call_api``
        """
        self.directory = make_dir(base_dir, subdir)
        self.args = args

    def __call__(self, tpl, raw_data, json_stuff):
        """Process whole post into directory

        :param tpl: pair stored as ``self.prefix`` and ``self.number``
        :param raw_data: mapping with the post fields; the ``text`` handler
            indexes ``copy_text`` and ``copy_post_id`` directly, so those
            keys must be present or `raw_data` must supply defaults
        :param json_stuff: the post as a JSON string, saved to ``raw.json``
        """
        self.urls = []
        self.prefix = tpl[0]
        self.number = tpl[1]

        # Collect the handlers first and run them afterwards: a handler may
        # add keys to `raw_data` (when it is a defaultdict), which must not
        # happen while the mapping is being iterated.
        handlers = []
        for k in list(raw_data):
            if k in self._IGNORED_KEYS:
                continue
            handler = getattr(self, k, None)
            # Plain data attributes (e.g. a post key named "urls") are not
            # handlers; treat them like unknown keys instead of crashing
            # later with "object is not callable".
            if callable(handler):
                handlers.append((k, handler))
            else:
                logging.warning("Not implemented: {}".format(k))

        keys = [k for (k, _) in handlers]
        logging.info("Saving: {} for {}".format(', '.join(keys), raw_data['id']))
        self.post_directory = make_dir(self.directory, str(raw_data['id']))

        self.save_raw(json_stuff)
        for (k, handler) in handlers:
            handler(k, raw_data)

        if self.urls and not self.args.no_download:
            download(self.urls,
                     self.post_directory,
                     )

    @staticmethod
    def _append_text(f_name, *chunks):
        """Append `chunks` (text or UTF-8 encoded bytes) to `f_name` as UTF-8.

        The file is closed even when a write fails.
        """
        import io  # local import so the module's import block stays untouched

        with io.open(f_name, 'a', encoding='utf-8') as out_file:
            for chunk in chunks:
                if isinstance(chunk, bytes):
                    chunk = chunk.decode('utf-8')
                out_file.write(chunk)

    def text(self, key, raw_data):
        """Save text of the note

        Appends to ``text.html`` in the post directory. For a plain post the
        post text is written; for a repost (non-empty ``copy_post_id``) the
        user's own text and/or the original text are written under separate
        headings. Nothing is written when there is no text at all.

        :param key: name of the dispatched key (unused)
        :param raw_data: mapping with ``text``, ``copy_text``, ``copy_post_id``
        """
        text = raw_data['text']
        users_text = raw_data['copy_text']
        is_repost = raw_data['copy_post_id'] != ''

        if not is_repost:  # user's post
            if text == '':
                return
            stuff = '<h1>Text:</h1>\n' + text
        elif text == '':  # repost without original text
            if users_text == '':
                return
            stuff = '<h1>Text:</h1>\n' + users_text
        elif users_text == '':  # repost without user's comment
            stuff = '<h1>Original text:</h1>\n' + text
        else:
            stuff = "<h1>User's text:</h1>\n" + users_text + \
                    '<h1>Original text:</h1>\n' + text

        self._append_text(path.join(self.post_directory, 'text.html'), stuff)

    def attachments(self, key, raw_data):
        """Save all attachments

        Every attachment ``{'type': T, T: payload}`` is passed to the method
        ``dl_<T>``; types without such a method are logged and skipped.
        """
        # Resolve everything first, so a malformed attachment raises before
        # any downloader has run.
        pending = []
        for att in raw_data[key]:
            t = att['type']
            downloader = getattr(self, 'dl_' + t, None)
            if callable(downloader):
                pending.append((downloader, att[t]))
            else:
                logging.warning("Not implemented downloader: {}".format(t))

        for (downloader, payload) in pending:
            downloader(payload)

    def comments(self, key, data):
        """Save all comments

        Every comment is fetched with ``wall.getComments``, processed by a
        nested ``PostParser`` working in the ``comments`` sub-directory of the
        post, and the collected list (count first, then the comments) is
        appended to ``comments.json``.
        """
        count = data[key]['count']
        if count == 0:
            return

        comments = [count, ]
        # NOTE(review): one request per comment (offset=x, count=1); fetching
        # in batches would need knowledge of the API response shape - confirm
        # before changing.
        for x in range(count):
            (comment_data, json_stuff) = call_api("wall.getComments",
                                                  [("owner_id", self.args.id),
                                                   ("post_id", data["id"]),
                                                   ("sort", "asc"),
                                                   ("offset", x),
                                                   ("count", 1),
                                                   ("preview_length", 0),
                                                   ("need_likes", 1),
                                                   ("v", 4.4),
                                                   ], self.args)
            comment = comment_data[1]
            comments.append(comment)
            # Missing fields read as '' so the comment can be handled by the
            # same code as a post.
            cdata = defaultdict(str, comment)
            pp = PostParser(self.post_directory, 'comments', self.args)
            pp(('comment to ', self.number), cdata, json_stuff)

        json_data = json.dumps(comments, indent=4, ensure_ascii=False)
        self._append_text(path.join(self.post_directory, 'comments.json'),
                          json_data)

    def save_raw(self, data):
        """Save raw post data

        :param data: JSON string; it is re-serialized with indentation and
            appended to ``raw.json`` in the post directory
        """
        pretty = json.dumps(json.loads(data), indent=4, ensure_ascii=False)
        self._append_text(path.join(self.post_directory, 'raw.json'), pretty)

    def save_url(self, url, name=None, subdir=''):
        """Queue `url` for download and log it to ``media_urls.txt``

        :param url: address of the media file
        :param name: optional target file name, sanitized with ``escape``
        :param subdir: optional sub-directory, stored alongside the url
        """
        if name is not None:
            name = escape(name)
        self.urls.append((url, name, subdir))
        self._append_text(path.join(self.post_directory, 'media_urls.txt'),
                          url, '\n')

    def dl_photo(self, data):
        """Download a photo

        vk is a bit crazy, it stores photo in a bunch of sizes:
            src
            src_small
            src_big
            src_xbig
            src_xxbig
            src_xxxbig
            (and what else?)

        The biggest size present in `data` is queued for download.
        """
        # try to get biggest size
        url = next((data[s] for s in self._PHOTO_SIZES if s in data), None)
        if url is None:
            logging.error("Unable to get photo url!")
        else:
            self.save_url(url)

    def dl_link(self, data):
        """Store links in a file"""
        self._append_text(path.join(self.post_directory, 'links.txt'),
                          data['url'], '\n')

    def dl_photos_list(self, data):
        """Download list of photos"""
        for x in data:
            self.dl_photo(x)

    def dl_audio(self, data):
        """Download an audio track and store its lyrics, if any

        The track is looked up with ``audio.getById``; its url is queued for
        download into the ``no_album`` sub-directory. When the track has a
        ``lyrics_id`` the lyrics are fetched with ``audio.getLyrics`` and
        appended to ``<file name>.txt`` next to the track.
        """
        aid = data["aid"]
        owner = data["owner_id"]
        request = "{}_{}".format(owner, aid)
        (audio_data, json_stuff) = call_api("audio.getById", [("audios", request), ], self.args)

        # Per-album directories are disabled (the original author noted that
        # the API changed), so every track goes into the same directory.
        album = 'no_album'

        try:
            track = audio_data[0]
        except IndexError:  # deleted :(
            logging.warning("Deleted track: {}".format(str(data)))
            return

        artist = track['artist'][:100]
        title = track['title'][:100]
        name = u"{} - {}.mp3".format(artist, title)
        make_dir(self.post_directory, album)
        self.save_url(track["url"], name, album)

        # store lyrics if any
        if "lyrics_id" not in track:
            return
        lid = track["lyrics_id"]
        (lyrics_data, json_stuff) = call_api("audio.getLyrics", [("lyrics_id", lid), ], self.args)
        f_name = path.join(self.post_directory, album, escape(name) + '.txt')
        self._append_text(f_name, lyrics_data["text"], '\n')

    # Video download is not implemented, so video attachments are reported
    # as "Not implemented downloader".
    # There's a workaround:
    #     http://habrahabr.ru/sandbox/57173/
    # But this requires authorization as another app
    #
    # def dl_video(self, data):
    #     #print data

    def dl_doc(self, data):
        """Download document (GIFs, etc.)

        The file name is the document title with its extension replaced by
        ``data["ext"]``.
        """
        stem = path.splitext(data["title"])[0]
        self.save_url(data["url"], stem + '.' + data["ext"])

    def dl_note(self, data):
        """Download note, not comments

        The note is fetched with ``notes.getById`` and stored in the
        ``note_<id>`` sub-directory of the post as ``text.html`` (title and
        text) and ``raw.json`` (the full API answer).
        """
        (note_data, json_stuff) = call_api("notes.getById", [
            ("owner_id", data["owner_id"]),
            ("nid", data["nid"]),
        ], self.args)
        stuff = u"<h1>{title}</h1>\n{text}".format(**note_data)
        ndir = make_dir(self.post_directory, 'note_'+note_data["id"])
        self._append_text(path.join(ndir, 'text.html'), stuff)

        ndata = json.dumps(note_data, indent=4, ensure_ascii=False)
        self._append_text(path.join(ndir, 'raw.json'), ndata)