-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy path100-47-network-15.py
640 lines (576 loc) · 24.3 KB
/
100-47-network-15.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
#coding=utf-8
"""
This script copies entries from a CSDN blog to another weblog, using the MetaWeblog API.
It can move both posts and comments.
Requires the 'BeautifulSoup' module.
Released under the GPL. Report bugs to [email protected]
Thanks to the original author, Wei Wei (Live Space Mover).
(C) Davelv, homepage http://www.davelv.net
(C) Wei Wei,homepage: http://www.broom9.com
General Public License: http://www.gnu.org/copyleft/gpl.html
Last modified 2012-01-03 06:21
"""
# Program version and name. ExportHead() concatenates them into the
# generator string of the export file; main() shows the version in --version.
__VERSION__ = "1.0"
__PROGRAM__ = "CsdnBlogMover"
import sys
import os
import codecs
import httplib
import urllib2
import re
import logging
from datetime import datetime, timedelta
import time
from optparse import OptionParser
from string import Template
import pickle
from xml.sax import saxutils
import json
from BeautifulSoup import BeautifulSoup
class IDGenerator:
    """Hand out consecutive integer IDs and remember which key got which ID.

    The first unseen key receives ``start``, the next one ``start + 1`` and
    so on. Asking again for a key that was already seen returns the ID it
    was given the first time.

    Each instance owns its own key -> ID table. (Previously the table was a
    class attribute, so every generator silently shared one mapping.)
    """
    # Class-level defaults kept for backward compatibility; __init__ always
    # overrides them on the instance.
    start = 0
    current = 0

    def __init__(self, start):
        """Create a generator whose first freshly allocated ID is ``start``."""
        self.start = start
        self.current = start
        # Key 0 is pre-mapped to ID 0 and never consumes a fresh ID
        # (presumably so a comment ParentId of 0 stays 0 -- confirm).
        self.dict = {0: 0}

    def GetID(self, key):
        """Return the ID for ``key``, allocating the next free one if new."""
        if key not in self.dict:
            self.dict[key] = self.current
            self.current += 1
        return self.dict[key]
# Placeholders for the two ID generators; main() rebinds both names to
# IDGenerator instances before any entry is arranged.
postIDGenerator = ''
commentIDGenerator = ''
# strptime format used by ParseCommentDate() for absolute comment timestamps.
csdnDatetimePattern = u'%Y-%m-%d %H:%M';
# Host name used for the shared connection and for building blog/author URLs.
csdnHost = u'blog.csdn.net'
# URL prefix of the comment-list endpoint; assigned by FetchBlogInfo().
csdnCommentsPre = u''
# Single connection object reused by GetPage() for every request.
http = httplib.HTTPConnection(csdnHost)
# Replacement templates used by PrettyCode(), keyed by the highlighter name
# accepted on the command line. \g<1> is the captured class attribute of the
# <textarea>, \g<2> is the captured code text.
hlightdict = {"syntaxhighlight": u'<pre class="brush: \g<1>">\g<2></pre>',
"geshi": u'<pre lang="\g<1>">\g<2></pre>'}
def GetPage(url, retryTimes=5, retryIntvl=3):
    """Fetch ``url`` over the shared module-level connection and decode it.

    Args:
        url: URL or path passed to ``http.request("GET", ...)``.
        retryTimes: maximum number of attempts.
        retryIntvl: seconds to sleep between a failed attempt and the next.

    Returns:
        The response body decoded as UTF-8, or None when ``retryTimes`` is
        not positive (no request is made in that case).

    Raises:
        Whatever the last attempt raised, once all attempts are used up.

    The pause and the attempt bookkeeping happen only after a failure. The
    earlier version did both in a ``finally`` clause, which also ran on a
    successful return: every fetch slept ``retryIntvl`` seconds and a
    success on the final attempt was turned into an error.
    """
    global http
    headers = {u'User-Agent': u'Fiddler',
               u'Connection': u'keep-alive'}
    while retryTimes > 0:
        retryTimes -= 1
        try:
            logging.info("get url:" + url)
            http.request("GET", url, headers=headers)
            return http.getresponse().read().decode("utf8")
        except Exception as err:
            # Reset the connection after any failure so the next request
            # starts from a clean state instead of a half-finished exchange.
            http.close()
            if retryTimes == 0:
                raise
            if isinstance(err, httplib.CannotSendRequest):
                logging.warning("Fetch data failure, reconnect after %ds", retryIntvl)
            else:
                logging.warning("Fetch data failure, retry after %ds", retryIntvl)
        time.sleep(retryIntvl)
def CheckAttachmentURL(url, attachEntrys):
    """Return True when no entry in ``attachEntrys`` already has this ``url``.

    Each entry is expected to be a mapping with a 'url' key.
    """
    alreadyKnown = any(url == known['url'] for known in attachEntrys)
    return not alreadyKnown
def ProcessAttachment(articleEntry, attachEntries=None):
    """Collect the CSDN attachment URLs referenced by an article.

    Every ``http://hi.csdn.net/attachment/...`` URL found in
    ``articleEntry['content']`` that is not yet present in ``attachEntries``
    is appended to it as a new attachment entry whose parent is the article.

    Args:
        articleEntry: entry dict; its 'content', 'date' and 'id' are read.
        attachEntries: list to extend in place. A fresh list is created when
            omitted (the old ``[]`` default was shared between calls).

    Returns:
        The list that received the new entries (``attachEntries`` itself
        when one was passed in).
    """
    if attachEntries is None:
        attachEntries = []
    attachRe = re.compile(u'http://hi.csdn.net/attachment/[^"]+')
    for attachurl in attachRe.findall(articleEntry['content']):
        # Skip URLs already recorded, including repeats within this article.
        if not CheckAttachmentURL(attachurl, attachEntries):
            continue
        fileName = attachurl.split(u'/')[-1]
        attachEntries.append({
            'title': fileName,
            'date': articleEntry['date'],
            'url': attachurl,
            'parentId': articleEntry['id'],
            # Attachments draw their IDs from the same generator as posts.
            'id': postIDGenerator.GetID(attachurl),
            'metaKey': u"_wp_attached_file",
            'metaValue': fileName,
            'status': u"inherit",
            'type': u"attachment",
            'content': u'',
            'comments': u'',
            'category': u'',
        })
    return attachEntries
def PrettyCode(content, hlight):
    """Rewrite CSDN code ``<textarea>`` blocks in ``content`` as ``<pre>`` tags.

    ``hlight`` selects the replacement template from ``hlightdict``
    ("syntaxhighlight" or "geshi"); an unknown name raises KeyError.
    Returns the rewritten content.
    """
    replacement = hlightdict[hlight]
    # Group 1 is the textarea's class attribute, group 2 the code text.
    codeArea = re.compile(u'<textarea.+?name="code".+?class="([^"]+)">(.+?)</textarea>', re.S)
    return codeArea.sub(replacement, content)
def PrettyComment(comment):
    """Convert CSDN ``[quote=...]`` and ``[reply]`` markup in a comment.

    A leading ``[quote=NAME]...[/quote]`` becomes a ``<fieldset>`` with NAME
    in its legend; each ``[reply]NAME[/reply]`` becomes a plain-text prefix.
    Returns the converted text.
    """
    # Applied in order: the quote rewrite first, then the reply rewrite.
    rewrites = (
        (re.compile(u'^\[quote=([^\]]+)\](.+)\[/quote\]', re.S),
         u'<fieldset><legend>引用 \g<1>:</legend>\g<2></fieldset>'),
        (re.compile(u'\[reply\]([^\[]+)\[/reply\]'),
         u'回复 \g<1>:'),
    )
    for pattern, replacement in rewrites:
        comment = pattern.sub(replacement, comment)
    return comment
def ParseCommentDate(dateStr, now=None):
    """Convert a CSDN comment timestamp string into a datetime.

    Recognised forms (examples):
        2011-11-11 11:11   absolute date, parsed with csdnDatetimePattern
        3天前 11:11         N days ago at the given time
        前天 11:11          two days ago at the given time
        昨天 11:11          yesterday at the given time
        11小时前            N hours ago
        11分钟前            N minutes ago
        刚刚                just now

    Args:
        dateStr: the timestamp text from the comment.
        now: reference time for the relative forms; defaults to
            ``datetime.today()``.

    Returns:
        A datetime, or None (after logging a warning) when no form matches.
    """
    if now is None:
        now = datetime.today()

    def at_time(daysAgo, hour, minute):
        # Seconds and microseconds are zeroed so they do not leak in from
        # the reference time; the page only gives hour and minute.
        moment = now.replace(hour=int(hour), minute=int(minute),
                             second=0, microsecond=0)
        return moment - timedelta(days=int(daysAgo))

    # An ordered sequence (rather than a dict) keeps the matching order
    # deterministic. The day count accepts several digits: a single-digit
    # pattern would read "13天前" as 3 days ago.
    parsers = (
        (u'\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}',
         lambda m: datetime.strptime(m.group(0), csdnDatetimePattern)),
        (u'(\\d+)天前 (\\d{1,2}):(\\d{1,2})',
         lambda m: at_time(m.group(1), m.group(2), m.group(3))),
        (u'前天 (\\d{1,2}):(\\d{1,2})',
         lambda m: at_time(2, m.group(1), m.group(2))),
        (u'昨天 (\\d{1,2}):(\\d{1,2})',
         lambda m: at_time(1, m.group(1), m.group(2))),
        (u'(\\d{1,2})小时前',
         lambda m: now - timedelta(hours=int(m.group(1)))),
        (u'(\\d{1,2})分钟前',
         lambda m: now - timedelta(minutes=int(m.group(1)))),
        (u'刚刚',
         lambda m: now),
    )
    for pattern, build in parsers:
        m = re.search(pattern, dateStr)
        if m:
            return build(m)
    logging.warning("Unrecognized comment date: %s", dateStr)
    return None
def FetchEntry(url, datetimePattern='%Y-%m-%d %H:%M', isPostOnly=False):
    """Download one CSDN article page and return it as an entry dict.

    Keys of the returned dict:
        title      article title
        date       datetime parsed from the page with ``datetimePattern``
        content    article body HTML ('' when not found)
        category   list of category names (empty when none found)
        prevLink   href of the previous article ('' when not found)
        id         int taken from the last path segment of ``url``
        comments   list of comment dicts; empty when ``isPostOnly`` is set
                   or the page reports zero comments
        parentId, type, status   fixed values 0, u'post', u'publish'
        metaKey, metaValue       u'views' and the view count

    Each comment dict is the server's JSON object minus the 'ArticleId',
    'BlogId', 'Replies' and 'Userface' fields, with 'PostTime' replaced by
    the result of ParseCommentDate().

    Calls ``sys.exit(2)`` when the article, its title, date, view count or
    comment count cannot be found on the page.
    """
    articleID = url.split('/')[-1]
    logging.debug("Fetch article page from %s", url)
    soup = BeautifulSoup(GetPage(url))
    item = {'title': '', 'date': '', 'content': '', 'category': [], 'prevLink': '', 'id': int(articleID),
            'comments': [],
            'parentId': 0, 'type': u'post', 'status': u'publish', 'metaKey': u'views', 'metaValue': 0}
    # find article
    article = soup.find(id="article_details")
    if article:
        logging.debug("Found article")
    else:
        logging.debug("Can't found article")
        sys.exit(2)
    # title
    temp = article.find(attrs={"class": "article_title"}).find(attrs={"class": "link_title"}).find('a')
    if temp:
        item['title'] = u'' + temp.contents[0].string
        logging.debug("Found title %s", item['title'])
    else:
        logging.warning("Can't find title")
        sys.exit(2)
    # category / date / view count / comment count all live in this node
    manage = article.find(attrs={"class": "article_manage"})
    # category
    temp = manage.find(attrs={"class": "link_categories"})
    if temp:
        item['category'] = [u'' + a.text for a in temp.findAll('a')]
        logging.debug("Found category %s", u', '.join(item['category']))
    else:
        logging.debug("No category, use default")
    # date
    temp = manage.find(attrs={"class": "link_postdate"})
    if temp:
        item['date'] = datetime.strptime(u'' + temp.contents[0].string, datetimePattern)
        logging.debug("Found date %s", item['date'])
    else:
        logging.warning("Can't find date")
        sys.exit(2)
    # views
    temp = manage.find(attrs={"class": "link_view"})
    if temp:
        # The last three characters of the text are dropped before the int
        # conversion (presumably a fixed text suffix -- confirm against the
        # page markup).
        item['metaValue'] = int(temp.contents[0][0:-3])
        logging.debug("Found views count %d", item['metaValue'])
    else:
        logging.warning("Can't find views count")
        sys.exit(2)
    # comments count
    temp = manage.find(attrs={"class": "link_comments"})
    comments_cnt = 0
    if temp:
        # First and last character of the second child are stripped
        # (presumably surrounding parentheses -- confirm).
        comments_cnt = int(temp.contents[1][1:-1])
        logging.debug("Found comments count %d", comments_cnt)
    else:
        logging.warning("Can't find comments count")
        sys.exit(2)
    # content
    temp = article.find(id="article_content") or article.find(attrs={"class": "article_content"})
    if temp:
        item['content'] = u''.join(map(unicode, temp.contents))
        logging.debug("Found content")
    else:
        logging.warning("Can't find content")
    # previous entry link
    temp = article.find('li', attrs={'class': 'prev_article'})
    if temp:
        item['prevLink'] = u'' + temp.find('a')['href']
        logging.debug("Found previous permaLink %s", item['prevLink'])
    # comments are fetched from a separate JSON endpoint
    if isPostOnly or comments_cnt == 0:
        return item
    commentsURL = csdnCommentsPre + articleID
    logging.debug("Fetch comments from %s", commentsURL)
    page = GetPage(commentsURL)
    # Response shape, per the original author's sample:
    # {"list":[{"ArticleId":..,"BlogId":..,"CommentId":..,"Content":"..",
    #           "ParentId":0,"PostTime":"..","Replies":null,"UserName":"..",
    #           "Userface":".."},...],...}
    comments = json.loads(page)['list']
    if comments is None:
        logging.warning("Can't find conments")
        # Fall back to an empty list; iterating None would raise TypeError.
        comments = []
    unusedFields = ('ArticleId', 'BlogId', 'Replies', 'Userface')
    for comment in comments:
        for field in unusedFields:
            # pop() tolerates a field the server happened to omit.
            comment.pop(field, None)
        comment['PostTime'] = ParseCommentDate(comment['PostTime'])
    item['comments'] = comments
    return item
def FetchBlogInfo(url, needPermaLink=True):
    """Read blog-level information from the page at ``url``.

    Side effect: sets the module-level ``csdnCommentsPre`` to the blog's
    comment-list URL prefix.

    Args:
        url: a URL under ``csdnHost``; the first path segment is the user.
        needPermaLink: when True, the first article link on the page is used
            as the starting permalink; when False, ``url`` itself is used.

    Returns:
        A dict with keys 'user', 'blogURL', 'nowTime', 'blogTitle',
        'blogDesc' and 'permaLink'. 'permaLink' is u'' when no article link
        could be found on the page (an error is logged).

    Raises:
        ValueError: when ``url`` does not contain ``csdnHost`` followed by a
            user name.
    """
    global csdnCommentsPre
    blogInfo = {}
    logging.info("connectiong to web page %s", url)
    soup = BeautifulSoup(GetPage(url))
    userMatch = re.search(csdnHost + "/([^/]+)", url)
    if userMatch is None:
        # Previously surfaced as an AttributeError on None.
        raise ValueError("Can't find a user name under %s in URL %s" % (csdnHost, url))
    blogInfo['user'] = u'' + userMatch.group(1)
    blogInfo['blogURL'] = u'http://' + csdnHost + '/' + blogInfo['user'] + '/'
    csdnCommentsPre = blogInfo['blogURL'] + "comment/list/"
    logging.info('Blog URL is %s', blogInfo['blogURL'])
    blogInfo['nowTime'] = u'' + datetime.now().strftime('%a, %d %b %Y %H:%M:%S +0800')
    blogInfo['blogTitle'] = u'' + soup.find("div", {"id": "blog_title"}).h2.text
    blogInfo['blogDesc'] = u'' + soup.find(id='blog_title').h3.text
    logging.debug('Blog Title is %s', blogInfo['blogTitle'])
    if not needPermaLink:
        blogInfo["permaLink"] = url
        return blogInfo
    # The href is stored as found on the page; it may be a site-relative path
    # such as "/davelv/article/details/6191987".
    titleNode = soup.find(attrs={"class": "link_title"})
    linkNode = titleNode.find('a') if titleNode is not None else None
    if linkNode:
        blogInfo["permaLink"] = linkNode["href"]
    else:
        logging.error("Can't find permaLink")
        # Keep the key present so callers do not fail with KeyError; an
        # empty link makes the fetch loop a no-op.
        blogInfo["permaLink"] = u''
    return blogInfo
def ExportHead(f, dic, categories=None):
    """Write the WXR header, channel metadata and category/tag list to ``f``.

    Args:
        f: writable file-like object accepting unicode text.
        dic: blog info mapping; 'blogTitle', 'blogDesc', 'blogURL' and
            'nowTime' are read. The mapping is not modified.
        categories: iterable of category names. Each name is written both as
            a category and as a tag, with consecutive term IDs from 1.

    The title and description are XML-escaped before being written.
    """
    t = Template(u"""<?xml version="1.0" encoding="UTF-8"?>
<!--
This is a WordPress eXtended RSS file generated by Live Space Mover as an export of
your blog. It contains information about your blog's posts, comments, and
categories. You may use this file to transfer that content from one site to
another. This file is not intended to serve as a complete backup of your
blog.
To import this information into a WordPress blog follow these steps:
1. Log into that blog as an administrator.
2. Go to Manage > Import in the blog's admin.
3. Choose "WordPress" from the list of importers.
4. Upload this file using the form provided on that page.
5. You will first be asked to map the authors in this export file to users
on the blog. For each author, you may choose to map an existing user on
the blog or to create a new user.
6. WordPress will then import each of the posts, comments, and categories
contained in this file onto your blog.
-->
<!-- generator="${programInfo}" created="${nowTime}"-->
<rss version="2.0"
xmlns:excerpt="http://wordpress.org/export/1.1/excerpt/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:wp="http://wordpress.org/export/1.1/"
>
<channel>
<title>${blogTitle}</title>
<link>${blogURL}</link>
<description>${blogDesc}</description>
<pubDate>${nowTime}</pubDate>
<generator>${programInfo}</generator>
<language>zh</language>
<wp:wxr_version>1.1</wp:wxr_version>""")  # needs blogTitle, blogDesc, blogURL, nowTime, programInfo
    catT = Template(u'''
<wp:category><wp:term_id>${categoryId}</wp:term_id><wp:category_nicename>${niceName}</wp:category_nicename><wp:category_parent/><wp:cat_name><![CDATA[${category}]]></wp:cat_name></wp:category>
<wp:tag><wp:term_id>${tagId}</wp:term_id><wp:tag_slug>${niceName}</wp:tag_slug><wp:tag_name><![CDATA[${category}]]></wp:tag_name></wp:tag>''')
    catParts = []
    for index, cat in enumerate(categories or ()):
        logging.debug("Cate:%s", cat)
        # Odd term IDs go to categories, the following even ID to the tag.
        catParts.append(catT.substitute(
            categoryId=2 * index + 1,
            tagId=2 * index + 2,
            category=cat,
            niceName=urllib2.quote(cat.encode('utf-8'))
        ))
    # Work on a copy: escaping in place would double-escape the title if the
    # same mapping were exported twice.
    values = dict(dic)
    values['blogTitle'] = saxutils.escape(dic['blogTitle'])
    values['blogDesc'] = saxutils.escape(dic['blogDesc'])
    values['programInfo'] = u'' + __PROGRAM__ + __VERSION__
    f.write(t.substitute(values))
    f.write(u''.join(catParts))
def GenerateComments(comments):
    """Render comment dicts as a string of ``<wp:comment>`` elements.

    Args:
        comments: iterable of dicts with keys 'CommentId', 'UserName',
            'PostTime' (a datetime), 'Content' and 'ParentId'. An empty
            iterable (including an empty string) yields u"".

    Returns:
        The concatenated XML fragments as unicode.

    The GMT timestamp is the post time minus eight hours.
    """
    commentT = Template(u"""
<wp:comment>
<wp:comment_id>${commentId}</wp:comment_id>
<wp:comment_author><![CDATA[${commentAuthor}]]></wp:comment_author>
<wp:comment_author_email></wp:comment_author_email>
<wp:comment_author_url>${authorURL}</wp:comment_author_url>
<wp:comment_author_IP></wp:comment_author_IP>
<wp:comment_date>${commentDate}</wp:comment_date>
<wp:comment_date_gmt>${commentDateGMT}</wp:comment_date_gmt>
<wp:comment_content><![CDATA[${commentContent}]]></wp:comment_content>
<wp:comment_approved>1</wp:comment_approved>
<wp:comment_type></wp:comment_type>
<wp:comment_parent>${parentId}</wp:comment_parent>
</wp:comment>""")  # needs commentId, commentAuthor, authorURL, commentDate, commentDateGMT, commentContent, parentId
    rendered = []
    for comment in comments:
        postTime = comment['PostTime']
        author = saxutils.escape(comment['UserName'])
        rendered.append(commentT.substitute(
            commentId=comment['CommentId'],
            commentAuthor=author,
            authorURL=u'http://' + csdnHost + u'/' + author,
            commentDate=postTime.strftime('%Y-%m-%d %H:%M:%S'),
            commentDateGMT=(postTime - timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'),
            # Comment text is user input: a literal "]]>" would end the CDATA
            # section early, so it is split across two adjacent sections.
            commentContent=comment['Content'].replace(u']]>', u']]]]><![CDATA[>'),
            parentId=comment['ParentId']))
    return u"".join(rendered)
def GeneratePostCategories(categories):
    """Render each category name as a category element and a tag element.

    ``categories`` is an iterable of unicode names; the nicename attribute is
    the URL-quoted UTF-8 form of the name. Returns the concatenated XML
    (u'' for an empty iterable).
    """
    cateT = Template(u"""
<category domain="category" nicename="${niceName}"><![CDATA[${category}]]></category>
<category domain="post_tag" nicename="${niceName}"><![CDATA[${category}]]></category>""")  # needs category, niceName
    fragments = [
        cateT.substitute(category=name,
                         niceName=urllib2.quote(name.encode('utf-8')))
        for name in categories
    ]
    return u''.join(fragments)
def GenerateMeta(key, value):
    """Return the ``<wp:meta_key>`` / ``<wp:meta_value>`` pair for one post meta.

    ``value`` is placed inside a CDATA section as-is.
    """
    fields = {'metaKey': key, 'metaValue': value}
    return Template(u"""
<wp:meta_key>${metaKey}</wp:meta_key>
<wp:meta_value><![CDATA[${metaValue}]]></wp:meta_value>
""").substitute(fields)
def GenerateAttatchmentURL(url):
    """Wrap ``url`` in a ``<wp:attachment_url>`` element, preceded by a newline."""
    opening = u"\n <wp:attachment_url>"
    closing = u"</wp:attachment_url>"
    return opening + url + closing
def ExportEntry(f, entry, user):
    """Write one post or attachment entry to ``f`` as a WXR ``<item>``.

    Args:
        f: writable file-like object accepting unicode text.
        entry: entry dict with keys 'id', 'content', 'status', 'parentId',
            'type', 'title', 'date' (a datetime), 'comments', 'category',
            'metaKey' and 'metaValue'. An optional 'url' key adds a
            ``<wp:attachment_url>`` element.
        user: author name written to ``<dc:creator>``.

    The GMT post date is the entry date minus eight hours.
    """
    itemT = Template(u"""
<item>
<title>${entryTitle}</title>
<link>${entryURL}</link>
<pubDate>${pubDate}</pubDate>
<dc:creator>${entryAuthor}</dc:creator>${categories}
<guid isPermaLink="false"></guid>
<description></description>
<content:encoded><![CDATA[${entryContent}]]></content:encoded>
<wp:post_id>${entryId}</wp:post_id>
<wp:post_date>${postDate}</wp:post_date>
<wp:post_date_gmt>${postDateGMT}</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>${postName}</wp:post_name>
<wp:status>${status}</wp:status>
<wp:post_parent>${parentId}</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>${type}</wp:post_type>${attachmentURL}
<wp:postmeta>${postMeta}</wp:postmeta>${comments}
</item>""")  # needs every ${...} name above
    entryDate = entry['date']
    if 'url' in entry:
        # The URL lands in element text, so "&" and "<" must be escaped.
        attachmentURL = GenerateAttatchmentURL(saxutils.escape(entry['url']))
    else:
        attachmentURL = u''
    itemStr = itemT.substitute(
        entryURL='',
        entryAuthor=user,
        entryId=entry['id'],
        # A literal "]]>" in the post body (common in code samples) would end
        # the CDATA section early; split it across two adjacent sections.
        entryContent=entry['content'].replace(u']]>', u']]]]><![CDATA[>'),
        status=entry['status'],
        parentId=entry['parentId'],
        type=entry['type'],
        entryTitle=saxutils.escape(entry['title']),
        postName=urllib2.quote(entry['title'].encode('utf-8')),
        postDate=entryDate.strftime('%Y-%m-%d %H:%M:%S'),
        pubDate=entryDate.strftime('%a, %d %b %Y %H:%M:%S +0800'),
        postDateGMT=(entryDate - timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'),
        comments=GenerateComments(entry['comments']),
        categories=GeneratePostCategories(entry['category']),
        postMeta=GenerateMeta(entry['metaKey'], entry['metaValue']),
        attachmentURL=attachmentURL
    )
    f.write(itemStr)
def ExportFoot(f):
    """Write the closing ``</channel>`` and ``</rss>`` tags to ``f`` and close it."""
    closingTags = """
</channel>
</rss>
"""
    f.write(closingTags)
    f.close()
def LoadCache(fileName='entries.cache'):
    """Load every pickled entry from the cache file, in file order.

    Returns an empty list when ``fileName`` does not exist. Reading stops at
    the first EOFError or pickle error, and whatever was loaded up to that
    point is returned.

    NOTE(review): the file is read with pickle, which can execute arbitrary
    code while loading; only use cache files written by this program.
    """
    cached = []
    if os.path.exists(fileName):
        logging.info('Found cache file')
        with open(fileName, 'r') as cacheFile:
            try:
                # Entries were dumped one after another, so keep loading
                # until the stream runs out.
                while True:
                    item = pickle.load(cacheFile)
                    logging.info('Load entry from cache file with title %s', item['title'])
                    cached.append(item)
            except (pickle.PickleError, EOFError):
                logging.info("No more entries in cache file for loading")
    return cached
def LoopFetchEntry(catchFileName, permaLink, isPostOnly=False, limit=0):
    """Fetch entries one after another by following each entry's 'prevLink'.

    Starting at ``permaLink``, every fetched entry is appended to the result
    list and also pickled to ``catchFileName`` (opened in append mode and
    flushed after each entry). The walk ends when the link is empty or when
    ``limit`` entries have been fetched; a ``limit`` of 0 means no limit.

    Returns the list of entries fetched by this call.
    """
    fetched = []
    with open(catchFileName, 'a') as cacheFile:
        done = 0
        while permaLink:
            entry = FetchEntry(permaLink, isPostOnly=isPostOnly)
            logging.info("Got a blog entry titled %s with %d comments successfully", entry['title'],
                         len(entry['comments']))
            fetched.append(entry)
            # Persist immediately so an interrupted run can resume later.
            pickle.dump(entry, cacheFile)
            cacheFile.flush()
            logging.debug("-----------------------")
            if 'prevLink' not in entry:
                break
            permaLink = entry['prevLink']
            done += 1
            if limit != 0 and done >= limit:
                break
    return fetched
def ArrangeEntries(entries, highlight, isAttach=True):
    """Prepare fetched entries for export.

    Sorts ``entries`` in place by their original id, then for each entry:
    replaces the id with one from ``postIDGenerator``, collects its
    categories, rewrites code blocks with PrettyCode(), rewrites comment
    markup with PrettyComment(), sorts comments by original CommentId and
    renumbers CommentId / ParentId through ``commentIDGenerator``. When
    ``isAttach`` is true, attachments referenced by the entry are collected.

    Returns ``(entries, attachEntries, categories)`` where ``categories`` is
    a set of names.
    """
    logging.info("Arrange entries")
    allCategories = set()
    attachments = []
    entries.sort(key=lambda entry: entry['id'])
    for post in entries:
        # new article id
        post['id'] = postIDGenerator.GetID(post['id'])
        allCategories.update(post['category'])
        post['content'] = PrettyCode(post['content'], highlight)
        replies = post['comments']
        for reply in replies:
            reply['Content'] = PrettyComment(reply['Content'])
        # Renumber in original-id order.
        replies.sort(key=lambda reply: reply['CommentId'])
        for reply in replies:
            reply['CommentId'] = commentIDGenerator.GetID(reply['CommentId'])
            reply['ParentId'] = commentIDGenerator.GetID(reply['ParentId'])
        if not isAttach:
            continue
        ProcessAttachment(post, attachments)
    logging.info("Arrange done")
    return entries, attachments, allCategories
def main(blogUrl):
    """Run the whole export: parse options, fetch, arrange, write the XML.

    Args:
        blogUrl: blog address to export. When truthy it takes precedence
            over the -b/--begin and -s/--source command-line options.

    Steps: parse command-line options, create the ID generators, read the
    blog info, resume from ``entries.cache`` if it holds entries, fetch the
    remaining entries, arrange them, write ``export_<timestamp>.xml`` and
    finally delete the cache file.

    Calls ``sys.exit(2)`` on an unknown highlight type, when no URL is
    available, or when the export file cannot be opened.
    """
    # optparse is used for compatibility with older Python versions
    parser = OptionParser(usage="%prog -s|b URL [Options]\n CSDN博客搬家程序".decode('utf-8'), version="%prog " + __VERSION__)
    parser.add_option("-s", "--source", action="store", type="string", dest="srcURL", help="CSDN博客地址".decode('utf-8'))
    parser.add_option("-b", "--begin", action="store", type="string", dest="beginURL",
                      help="指定一个日志链接作为起始地址".decode('utf-8'))
    parser.add_option("-n", "--number", action="store", type="int", dest="limit", default=0,
                      help="导出的日志数目,默认无限制(0)".decode('utf-8'))
    parser.add_option("-o", "--postonly", action="store_true", dest="isPostOnly", default=False,
                      help="不导出日志的评论".decode('utf-8'))
    parser.add_option("-a", "--noattach", action="store_false", dest="isAttach", default=True,
                      help="不处理日志附件(CSDN博客附件不支持外链,慎用)".decode('utf-8'))
    parser.add_option("-i", "--idstart", action="store", type="int", dest="startId", default=10,
                      help="导出日志/评论在Wordpress中起始编号,默认10".decode('utf-8'))
    parser.add_option("-l", "--highlight", action="store", type="string", dest="lighttype", default="syntaxhighlight",
                      help="代码高亮可选syntaxhighlight和geshi两种,默认第一种,需对应插件支持".decode('utf-8'))
    (options, args) = parser.parse_args()
    options.lighttype = options.lighttype.lower()
    if options.lighttype not in hlightdict:
        logging.warning("Hightlight type error,exit")
        sys.exit(2)
    logging.info("Use code highlight type: %s", options.lighttype)
    # ID generators for posts/attachments and for comments
    global postIDGenerator
    postIDGenerator = IDGenerator(options.startId)
    global commentIDGenerator
    commentIDGenerator = IDGenerator(options.startId)
    # find blog info
    if (blogUrl):
        blogInfo = FetchBlogInfo(blogUrl, True)
        logging.info('Start fetching from %s', blogUrl)
        options.srcURL = blogUrl
    elif options.beginURL:
        blogInfo = FetchBlogInfo(options.beginURL, False)
        logging.info('Start fetching from %s', options.beginURL)
    elif options.srcURL:
        blogInfo = FetchBlogInfo(options.srcURL, True)
        logging.info("Found permaLink %s", blogInfo["permaLink"])
    else:
        logging.error("Error, you must give either srcURL or beginURL")
        sys.exit(2)
    # load cache and resume from the last post in it
    cacheName = 'entries.cache'
    entries = LoadCache(cacheName)
    if len(entries) > 0 and not options.beginURL:
        permaLink = entries[-1]['prevLink']
    else:
        permaLink = blogInfo['permaLink']
    # main loop: fetch the remaining blog entries
    entries.extend(LoopFetchEntry(cacheName, permaLink, options.isPostOnly, options.limit))
    # data arrangement
    postEntries, attachEntries, categories = ArrangeEntries(entries, options.lighttype, options.isAttach)
    # open the export file; codecs.open raises on failure rather than
    # returning a falsy object, so the error is handled as an exception
    exportFileName = 'export_' + datetime.now().strftime('%Y%m%d-%H%M%S') + '.xml'
    try:
        f = codecs.open(exportFileName, 'w', 'utf-8')
    except IOError:
        logging.error("Can't open export file %s for writing", exportFileName)
        sys.exit(2)
    logging.info('Export XML to file %s', exportFileName)
    try:
        ExportHead(f, blogInfo, categories)
        logging.debug('Exported header')
        # export attachments
        for entry in attachEntries:
            ExportEntry(f, entry, blogInfo['user'])
        # export entries
        for entry in postEntries:
            ExportEntry(f, entry, blogInfo['user'])
        # export footer (this also closes f)
        ExportFoot(f)
        logging.debug('Exported footer')
    finally:
        # Closes the file when an export step raised before ExportFoot();
        # closing an already closed file is harmless.
        f.close()
    # delete cache file
    os.remove(cacheName)
    logging.info("Deleted cache file")
    logging.info("Finished! Congratulations!")
if __name__ == "__main__":
    # Everything from DEBUG up goes to a log file that is truncated per run.
    logging.basicConfig(filename='blog-mover.log',
                        filemode='w',
                        level=logging.DEBUG,
                        format='LINE %(lineno)-4d %(levelname)-8s %(message)s',
                        datefmt='%m-%d %H:%M')
    # INFO and above are additionally echoed to sys.stderr in a simpler format.
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s'))
    consoleHandler.setLevel(logging.INFO)
    logging.getLogger('').addHandler(consoleHandler)
    try:
        # NOTE(review): the blog URL is hard-coded here, and main() gives it
        # precedence over the -s/-b command-line options.
        main("http://blog.csdn.net/v_july_v")
    except SystemExit:
        # sys.exit() is the script's normal way of aborting; stay quiet.
        pass
    except:
        logging.exception("Unexpected error")
        raise