-
Notifications
You must be signed in to change notification settings - Fork 1
/
make_course_rss.py
194 lines (168 loc) · 10.2 KB
/
make_course_rss.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
''' This is a script which will take an edX course export, and create
an RSS feed from it.
I *strongly* recommend running clean_studio_xml on a dump before
running this script.
Limitations:
* This does not pay attention to release dates. To-be-released videos
can appear in the RSS feed.
* Courses must use Youtube videos. I use Youtube as a
transcoder. Google invested millions into doing this well, and I
didn't want to replicate the effort. As a result, if Google changes
things around, we may need to swap things around.
* We don't have course URLs by default. This sure would be nice.
* It would be nice to embed pages for where we have assessments and
interactives. RSS supports this, but the script does not (in part
due to complexity of generating URLs).
If you'd like to do this a lot, generate a Google API key, and use the
environment variables: GOOGLE_DEVID and GOOGLE_DEVKEY. Google locks
you out pretty quickly without those.
'''
import StringIO
import argparse
import datetime
import json
import os
import os.path
import re
import subprocess
import sys
import urlparse
import xml.dom.minidom

import PyRSS2Gen

import helpers
# Command-line interface: two required positionals (where the course export
# lives and where the feed will be served from) plus optional tuning flags.
parser = argparse.ArgumentParser(
    description="Generate an RSS feed of a course.",
)
parser.add_argument(
    "export_base",
    help="Base directory of Studio-dumped XML",
)
parser.add_argument(
    "url_base",
    help="URL the feed will be hosted from",
)
parser.add_argument(
    "--format",
    dest='format',
    default='3gp',
    help="Format of RSS feed (mp4, webm, 3gp, or m4a)",
)
parser.add_argument(
    "--course_url",
    dest="course_url",
    default="https://www.edx.org/",
    help="URL of the course about page",
)
parser.add_argument(
    "--output_dir",
    dest="output_dir",
    default="output",
    help="Output directory",
)
parser.add_argument(
    "--output_file",
    dest="output_file",
    help="Output filename",
)
# --reverse is a string flag ("true"/"false") rather than a store_true switch.
parser.add_argument(
    "--reverse",
    dest="reverse",
    default="false",
    choices=["true", "false"],
    help="Reverse order",
)
args = parser.parse_args()
# Per-format settings, keyed by the value accepted by --format.  Each entry
# carries the format code handed to youtube-dl, the file extension of the
# downloaded media, the MIME type advertised in the feed enclosure, and two
# human-readable strings that are spliced into the feed descriptions.
video_format_parameters = dict()

video_format_parameters['mp4'] = dict(
    youtube_dl_code='mp4',
    video_extension='mp4',
    mimetype='video/mp4',
    video_codec_name='MPEG Video',
    codec_description='This RSS feed is for MPEG videos. This is the most common video format and should work with most software. ',
)

video_format_parameters['webm'] = dict(
    youtube_dl_code='webm',
    video_extension='webm',
    mimetype='video/webm',
    video_codec_name='WebM Video',
    codec_description='This RSS feed is using WebM videos. WebM is an advanced video format developed by Google. This is the recommended feed if your software supports it (most software does not). ',
)

video_format_parameters['3gp'] = dict(
    youtube_dl_code='3gp',
    video_extension='3gp',
    mimetype='video/3gpp',
    video_codec_name='3GPP Video',
    codec_description='This RSS feed is for video files in the 3gpp format. 3gpp is a low-bandwidth format commonly used for video delivered to cell phones. ',
)

# Audio-only feed: the youtube-dl code is a numeric format id, not a name.
video_format_parameters['m4a'] = dict(
    youtube_dl_code='140',
    video_extension='m4a',
    mimetype='audio/mp4a-latm',
    video_codec_name='AAC Audio',
    codec_description='This is an audio-only RSS feed. It uses the AAC audio codec. ',
)
video_format = args.format

# Settings of the format selected on the command line; an unknown --format
# value fails here with a KeyError.
format_settings = video_format_parameters[video_format]

# Flat settings dictionary used for the rest of the script.  It doubles as the
# keyword source for the two description templates below, which are filled in
# later with str.format(**conf).
conf = {
    # Straight from the command line.
    'video_format': video_format,
    'url_base': args.url_base,
    'export_base': args.export_base,
    'course_url': args.course_url,
    'output_dir': args.output_dir,
    'output_file': args.output_file,
    'reverse': args.reverse.lower() == "true",

    # Copied from the selected format's entry.
    'mimetype': format_settings['mimetype'],
    'codec_description': format_settings['codec_description'],
    'video_codec_name': format_settings['video_codec_name'],
    'youtube_dl_code': format_settings['youtube_dl_code'],
    'video_extension': format_settings['video_extension'],

    # Text templates for the feed title, the feed description and each item.
    'podcast_title': "Videos from {course_org} : {course_name} on edX",
    'podcast_description': '''A prototype podcast of the videos from {course_name}, a course from {course_org} on edX. The full course, including assessments, is available, free-of-charge, at {course_url}. {codec_description} Note that this is a podcast of just the videos from an interactive on-line course; in some cases, the videos may be difficult to follow without integrated assessments, simulations, or other interactions at {course_url}. RSS feeds for other codecs are available. For a more complete experience, please visit the full course. ''',
    'video_description': '''{youtube_description} {video_location}. This is a prototype podcast of the videos from {course_name}. The full course is available free-of-charge at {course_url}. Note that the full course includes assessments, as well as other interactives (such as simulations, discussions, etc.). Some videos may be difficult to follow without the integrated interactions. For a more complete experience, please visit the full course. ({pretty_length}, {duration}, {video_codec_name}) ''',
}
# Progress message; the parenthesised form prints the same text as the
# two-argument print statement.
print("Encoding " + conf['export_base'])

tree = helpers.load_xml_course(conf['export_base'])

# Copy the course identity from the attributes of the export's root element
# into conf, so the title/description templates can refer to it by name.
course_attrib = tree.getroot().attrib
conf['course_org'] = course_attrib['org']
conf['course_number'] = course_attrib['course']
conf['course_id'] = course_attrib['url_name']
conf['course_name'] = course_attrib['display_name']
items = []

# The ID is interpolated into file names and into a download command line, so
# only a non-empty run of URL-safe characters is accepted.  \Z is used rather
# than $ because $ would also accept an ID that ends in a newline.
valid_youtube_id = re.compile(r"^[0-9a-zA-Z_\-]+\Z")

# Cached metadata and downloaded media are both written to the output
# directory, so make sure it exists before the first write.
if not os.path.isdir(conf['output_dir']):
    os.makedirs(conf['output_dir'])

## Add videos
# Walk every element of the course tree and turn each <video> that carries a
# YouTube ID into one RSS item: metadata is fetched (and cached as JSON), the
# media file is downloaded with youtube-dl if it is not already present, and
# the item's enclosure points at that file under conf['url_base'].
for e in tree.iter():
    if e.tag != 'video':
        continue

    # Videos with a missing or empty YouTube ID cannot be downloaded; skip.
    youtube_id = e.attrib.get('youtube_id_1_0')
    if not youtube_id:
        continue

    # Validate BEFORE the ID is used for any path, request or command.
    if not valid_youtube_id.match(youtube_id):
        raise TypeError("Youtube ID has an invalid string. Security issue?")

    # Per-video metadata, cached on disk so reruns do not query again.
    youtube_cache = os.path.join(conf['output_dir'], youtube_id + ".json")
    if os.path.exists(youtube_cache):
        with open(youtube_cache) as cache_file:
            youtube_info = json.load(cache_file)
    else:
        youtube_info = helpers.youtube_entry(youtube_id)
        with open(youtube_cache, "w") as cache_file:
            json.dump(youtube_info, cache_file)

    item_dict = dict()
    item_dict['title'] = e.attrib['display_name']
    # 'Video' is treated as a placeholder name; fall back to YouTube's title.
    if item_dict['title'] == 'Video':
        item_dict['title'] = youtube_info['title']
    item_dict['guid'] = e.attrib['url_name']
    item_dict['link'] = "https://www.youtube.com/watch?v=" + youtube_id

    # Breadcrumb of display names from the outermost ancestor down to the
    # video itself.
    # NOTE(review): relies on each node exposing a `.parent` attribute, which
    # is presumably attached by helpers.load_xml_course -- confirm.
    description = list()
    node = e
    while node is not None:
        if 'display_name' in node.attrib:
            description.append(node.attrib['display_name'])
        node = node.parent
    description.reverse()

    # Download the media file unless a previous run already fetched it.  The
    # command is passed as an argument list (no shell), and a failing
    # youtube-dl raises CalledProcessError instead of being ignored.
    base_filename = youtube_id + "." + conf['video_extension']
    dl_filename = os.path.join(conf['output_dir'], base_filename)
    if not os.path.exists(dl_filename):
        subprocess.check_call([
            "youtube-dl",
            "-f", conf['youtube_dl_code'],
            "https://www.youtube.com/watch?v=" + youtube_id,
            "-o", dl_filename,
        ])
    length = os.stat(dl_filename).st_size
    pretty_length = helpers.format_file_size(length)

    # Normalise the YouTube description: never None, and separated from the
    # text that follows it in the template by a single trailing space.
    youtube_description = youtube_info['description'] or ""
    if youtube_description and not youtube_description.endswith(' '):
        youtube_description = youtube_description + ' '

    item_dict['description'] = conf['video_description'].format(
        youtube_description=youtube_description.encode('utf-8'),
        video_location=(" / ".join(description)).encode('utf-8'),
        pretty_length=pretty_length.encode('utf-8'),
        duration=youtube_info['duration_str'].encode('utf-8'),
        **conf
    )
    item_dict['enclosure'] = PyRSS2Gen.Enclosure(
        url=urlparse.urljoin(conf['url_base'], base_filename),
        length=length,
        type=conf['mimetype'],
    )
    items.append(PyRSS2Gen.RSSItem(**item_dict))
# Items were collected in course order; optionally flip them.
if conf["reverse"]:
    items.reverse()

# Assemble the feed.  Title and description are templates filled from conf.
rss = PyRSS2Gen.RSS2(
    title=conf["podcast_title"].format(**conf),
    link=args.course_url,
    description=conf["podcast_description"].format(**conf),
    lastBuildDate=datetime.datetime.now(),
    items=items,
    managingEditor="edX Learning Sciences",
)

## Write output to a file
# Serialise to memory first so the XML can be re-parsed and pretty-printed.
# The item descriptions are UTF-8 byte strings, so the document is generated
# as UTF-8; with write_xml's default (iso-8859-1) non-ASCII text would be
# declared in the wrong encoding and garbled when minidom re-parses it.
data = StringIO.StringIO()
rss.write_xml(data, encoding='utf-8')

# Use the explicit --output_file when given, otherwise derive a name from the
# course identity and the chosen format inside the output directory.
if conf["output_file"]:
    output_filename = conf["output_file"]
else:
    output_filename = "{output_dir}/{org}_{course}_{url_name}_{format}.rss".format(
        org=conf['course_org'],
        course=conf['course_number'],
        url_name=conf['course_id'],
        format=video_format,
        output_dir=conf['output_dir'],
    )

pretty_xml = xml.dom.minidom.parseString(data.getvalue()).toprettyxml()
# `with` guarantees the file is closed even if the write fails.
with open(output_filename, "w") as f:
    f.write(pretty_xml.encode('utf-8'))

# Two spaces after "Saved" reproduce the original message exactly.
print("Saved  " + output_filename)