From c4edb269d2c73d3f70f4de14e5770c9e96f10dd9 Mon Sep 17 00:00:00 2001 From: Endericedragon <43962007+Endericedragon@users.noreply.github.com> Date: Thu, 12 Mar 2020 22:03:48 +0800 Subject: [PATCH] v0.03 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 新的v0.03来了,这次它可以下载pdf课件了! v0.03 is here, and it can now download PDF files! Download from Lanzou Cloud: https://www.lanzous.com/ia6vxdi P.S.: Extract the 7z file from Lanzou Cloud and you can run __main__.py as long as you have Python installed. --- createFolder.py | 2 +- network.py | 47 +++++++++++++++++++++++++++++++++++++++++------ parseList.py | 44 +++++++++++++++++++++++++++++++++++++++++++- process.py | 11 ++++++----- 4 files changed, 91 insertions(+), 13 deletions(-) diff --git a/createFolder.py b/createFolder.py index 16d9584..e82f70a 100644 --- a/createFolder.py +++ b/createFolder.py @@ -24,7 +24,7 @@ def creater(courseName, courseDict): url = info['url'] path = courseName+'\\'+week+'\\'+section #network.ariaDown(url, fileName, path) - if (fileName[-4:]=='m3u8'): + if (fileName[-4:]=='m3u8' or info['format']=='pdf'): network.urllibDown(url, fileName, path) else: p.apply_async(network.ariaDown, args=(url, fileName, path)) diff --git a/network.py b/network.py index 1f47ce7..f2e42a0 100644 --- a/network.py +++ b/network.py @@ -117,11 +117,46 @@ def urllibDown(url, fileName, path): if (os.path.exists(path+'\\'+fileName) or os.path.exists(path+'\\'+fileName) or os.path.exists(path+'\\'+fileName)): print('File exists.{0} won\'t be downloaded.'.format(fileName)) return 0 + for i in range(1,6): + try: + print('正在下载'+fileName+'...') + r = requests.get(url, headers = hd, timeout = 2) + break + except: + print('重试...第%d次' % (i)) + with open(path+'\\'+fileName, 'wb') as f: + f.write(r.content) + print(fileName, '下载完成。') + +def getPdfUrl(name, conId, Id): + pdfUrl = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr' + pdfData = { 
'callCount':'1', + 'scriptSessionId':'${scriptSessionId}190', + 'httpSessionId':'0397251039424db2b1659352f45d1540', + 'c0-scriptName':'CourseBean', + 'c0-methodName':'getLessonUnitLearnVo', + 'c0-id':'0', + 'c0-param0':'number:'+conId, #(contentId) + 'c0-param1':'number:3', + 'c0-param2':'number:0', + 'c0-param3':'number:'+Id, #(id) + 'batchId':'1584017274511' + } try: - print('正在下载'+fileName+'...') - res = urllib.request.urlopen(url) - with open(path+'\\'+fileName, 'wb') as f: - f.write(res.read()) + r = requests.post(pdfUrl, headers = hd, data = pdfData) + r.encoding = r.apparent_encoding + r.raise_for_status() except: - print('Urllib error!') - return -1 + return '' + gotIt = r.text + pdfGet = re.search('textOrigUrl:\".+?\"', gotIt) + if pdfGet: + pdfGet = pdfGet.group()[13:-1] + else: + pdfGet = '' + return pdfGet +def test(): + getPdfUrl('wow', '1003316105', '1214227359') +if __name__ == '__main__': + test() \ No newline at end of file diff --git a/parseList.py b/parseList.py index ef495af..8c1aa10 100644 --- a/parseList.py +++ b/parseList.py @@ -1,8 +1,42 @@ import re from network import getVideoUrl +from network import getPdfUrl + +def char2int(n): + indexDict = { + '一':1, + '二':2, + '三':3, + '四':4, + '五':5, + '六':6, + '七':7, + '八':8, + '九':9, + '十':10 + } + if len(n)==1: + return str(indexDict[n]) + elif len(n)==2 and n[0]=='十': + return str(10+indexDict[n[1]]) + elif len(n)==2 and n[1]=='十': + return str(indexDict[n[0]]*10) + elif len(n)==2: + return str(indexDict[n[0]]*10+indexDict[n[1]]) + else: + return str(indexDict[n[0]]*10+indexDict[n[2]]) def namer(name, num='k'): - forbid = re.compile('[\\/:\*\?\"<>\|]') + weeksForbid1 = re.compile('^第[一二三四五六七八九十][讲周]') + weeksForbid2 = re.compile('^第[一二三四五六七八九十][一二三四五六七八九十][讲周]') + weeksForbid3 = re.compile('^第[一二三四五六七八九十][一二三四五六七八九十][一二三四五六七八九十][讲周]') + if weeksForbid1.match(name): + name = name[0]+char2int(name[1])+name[2:] + elif weeksForbid2.match(name): + name = name[0]+char2int(name[1:3])+name[3:] + elif 
weeksForbid3.match(name): + name = name[0]+char2int(name[1:4])+name[4:] + forbid = re.compile('[\\/:\*\?\"<>\|\s]') legal_name = '' for each in name: if forbid.match(each): @@ -64,6 +98,7 @@ def parser(courseList:str, courseUrl)->dict: 一级目录 无chapterId 无contentId contentType=1 二级目录 有chapterId 无contentId contentType=1 视频文件 有chapterId 有contentId contentType=1 + pdf文件 有chapterId 有contentId contentType=3 ''' courseList = courseList.split('\n') for line in courseList: @@ -95,5 +130,12 @@ def parser(courseList:str, courseUrl)->dict: 'format':yes[1] } cc+=1 + elif (getchId(line) and getcoId(line) and getcoType(line)=='3'): + # ~ print('pdf课件') + ok = getPdfUrl(getName(line), getcoId(line), getId(line)) + temp2[namer(getName(line))]={ + 'url':ok, + 'format':'pdf' + } return courseDict diff --git a/process.py b/process.py index a68af58..20dc5fa 100644 --- a/process.py +++ b/process.py @@ -1,6 +1,7 @@ import re import os import pickle +import network from multiprocessing import Pool def namer(num): @@ -22,7 +23,7 @@ def m3u8Process(courseName, courseDict): for vidName, detail in vidInfo.items(): #vidName没有后缀名 if detail['format']!='m3u8': continue - elif os.path.exists(path+'\\'+vidName+'.mp4'): + elif os.path.exists(path+'\\'+vidName+'.ts'): continue for vid in os.listdir(path): #vid带有后缀名 if re.search(vidName, vid) and 'mp4' not in vid: @@ -37,17 +38,17 @@ def m3u8Process(courseName, courseDict): for each in tsInfo: each = each.strip() if each[0]!='#': - # ~ os.system('{0} -n -i \"{1}\" \"{2}\\{3}.mp4\"'.format(fg, url+each, path, namer(cc))) - p.apply_async(os.system, args = ('{0} -n -i \"{1}\" \"{2}\\{3}.mp4\"'.format(fg, url+each, path, namer(cc)),)) + # ~ p.apply_async(os.system, args = ('{0} -n -i \"{1}\" \"{2}\\{3}.ts\"'.format(fg, url+each, path, namer(cc)),)) + p.apply_async(network.urllibDown, args = (url+each, namer(cc)+'.ts', path)) cc+=1 p.close() p.join() print(vidName, '的分片文件全部下载完成') files = 'concat:' for i in range(cc): - files+=(path+'\\'+namer(i)+'.mp4|') + 
files+=(path+'\\'+namer(i)+'.ts|') files = files[:-1] os.system('{0} -n -i \"{1}\" \"{2}\\{3}.mp4\"'.format(fg, files, path, vidName)) os.remove(path+'\\'+vid) for ck in range(cc): - os.system('del \"{0}\\{1}.mp4\"'.format(path, namer(ck))) + os.system('del \"{0}\\{1}.ts\"'.format(path, namer(ck)))