-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathteletask-dl.py
executable file
·129 lines (107 loc) · 3.52 KB
/
teletask-dl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python
import os
import re
import sys
import textwrap
import urllib
import urllib.request
def download(src, location):
    """Download the file at URL *src* into directory *location*.

    The destination filename is the last path component of *src*.
    An empty *location* means the current working directory.
    """
    if location == '':
        location = '.'
    # Keep the original filename from the URL.
    dest = os.path.join(location, src.split('/')[-1])
    print("Downloading")
    print("Source: " + src)
    print("Destination: " + dest + "\n")
    # Context managers close both handles even on error (the original
    # leaked the local file object if urlopen raised).
    with urllib.request.urlopen(src) as remotefile, \
            open(dest, 'wb') as localfile:
        localfile.write(remotefile.read())
def print_usage():
    """Print command-line usage information to stdout.

    Uses sys.argv[0] as the program name in the usage line.
    """
    # textwrap.dedent keeps the literal readable in-source while
    # printing it flush-left.  Typos fixed: "writen" -> "written",
    # "trafic" -> "traffic".
    print(textwrap.dedent("""\
        Usage: %s [URL] [MODE] [LOCATION]
        [URL]
        The URL of the overview page. (required)
        [MODE]
        -C Create list of download links.
        Does not download anything.
        List is written to stdout but you can pipe it anywhere.
        -D Download every podcast.
        This may take some time and produce traffic.
        default: -C
        [LOCATION]
        The location to
        * the file where the list of links shall be stored (if MODE = -C)
        * the folder where to store the podcasts (if MODE = -D).
        If no LOCATION is given the link list will be written to stdout
        and podcasts will be stored in the current working directory.
        """ % sys.argv[0]))
def get_child_urls(src):
    """Fetch the overview page at *src* and return a list of absolute
    URLs of all linked per-lecture pages.

    A page is recognised by a link path of the form
    /archive/video/ipod/<digits>/ ; matches are prefixed with the
    tele-task host to form full URLs.
    """
    # urlopen returns bytes on Python 3; decode before regex matching.
    # The context manager closes the response (the original leaked it).
    with urllib.request.urlopen(src) as page:
        parent_content = page.read().decode('utf-8', errors='replace')
    child_urls = []
    for line in parent_content.split('\n'):
        mo = re.search(r'/archive/video/ipod/[0-9]+/', line)
        if mo:
            child_urls.append("http://www.tele-task.de" + mo.group())
    return child_urls
def crawl_and_create_list(child_urls, location):
    """Visit every URL in *child_urls*, extract the podcast download
    link from each page and write it (one link per line) to the file
    *location*, or to stdout when *location* is the empty string.
    """
    # Compiled once outside the loop; dots escaped — the original
    # pattern's bare '.' matched any character.
    link_re = re.compile(
        r'http://stream\.hpi\.uni-potsdam\.de:8080/download/podcast/'
        r'.*\.mp4')
    # Text mode ('w', not the original 'wb'): we write str, not bytes.
    listfile = open(location, 'w') if location != '' else None
    try:
        # Crawl all child pages, find the download link and write it.
        for url in child_urls:
            with urllib.request.urlopen(url) as page:
                child_content = page.read().decode('utf-8',
                                                   errors='replace')
            mo = link_re.search(child_content)
            if mo:
                if listfile is None:
                    print(mo.group())
                else:
                    listfile.write(mo.group() + '\n')
    finally:
        # Close the list file even if a fetch fails mid-crawl.
        if listfile is not None:
            listfile.close()
def crawl_and_download(child_urls, location):
    """Visit every URL in *child_urls*, extract the podcast download
    link from each page and download the podcast into the folder
    *location* ('' or '.' means the current working directory; any
    other folder is created on demand).
    """
    # Create the target folder only when it is a real path that does
    # not exist yet.
    if location not in ('', '.') and not os.path.exists(location):
        os.makedirs(location)
    # Compiled once outside the loop; dots escaped — the original
    # pattern's bare '.' matched any character.
    link_re = re.compile(
        r'http://stream\.hpi\.uni-potsdam\.de:8080/download/podcast/'
        r'.*\.mp4')
    # Crawl all child pages, find the download link, fetch the podcast.
    for url in child_urls:
        with urllib.request.urlopen(url) as page:
            child_content = page.read().decode('utf-8', errors='replace')
        mo = link_re.search(child_content)
        if mo:
            download(mo.group(), location)
def main():
    """Parse command-line arguments and dispatch to list/download mode.

    Argument layout: argv[1] = URL (required), argv[2] = mode flag
    (-C / -D) or already the location, argv[3] = location.
    """
    # URL is mandatory; show help and quit otherwise.
    # sys.exit() instead of bare exit(): exit() is injected by the
    # site module and is not guaranteed to exist.
    if len(sys.argv) < 2:
        print_usage()
        sys.exit()
    # Defaults: create a link list, write to stdout / cwd.
    mode = 'C'
    location = ''
    url = sys.argv[1]
    # Second argument may be the mode flag or already the location.
    if len(sys.argv) > 2:
        if sys.argv[2] == "-C":
            mode = 'C'
        elif sys.argv[2] == "-D":
            mode = 'D'
        else:
            mode = 'C'
            location = sys.argv[2]
    # Explicit LOCATION wins only if argv[2] was not already taken as one.
    if len(sys.argv) > 3 and location == '':
        location = sys.argv[3]
    child_urls = get_child_urls(url)
    if mode == "C":
        crawl_and_create_list(child_urls, location)
    elif mode == "D":
        crawl_and_download(child_urls, location)


# Guard the entry point so importing this module has no side effects
# (the original called main() unconditionally at import time).
if __name__ == "__main__":
    main()