]>
Commit | Line | Data |
---|---|---|
f42c1907 PH |
1 | from __future__ import unicode_literals |
2 | ||
291a168b | 3 | import re |
291a168b PH |
4 | |
5 | from .common import InfoExtractor | |
6 | from ..utils import ( | |
291a168b PH |
7 | ExtractorError, |
8 | orderedSet, | |
9 | unescapeHTML, | |
10 | ) | |
11 | ||
12 | ||
class StanfordOpenClassroomIE(InfoExtractor):
    """Extractor for Stanford OpenClassroom (openclassroom.stanford.edu).

    The result depends on which named groups of ``_VALID_URL`` matched:

    * ``course`` and ``video``: a single video entry.
    * ``course`` only: a playlist of the video pages linked from the
      course page.
    * no ``course``: a playlist of the course pages linked from the
      site's home page.
    """

    IE_NAME = 'stanfordoc'
    IE_DESC = 'Stanford Open ClassRoom'
    _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    _TEST = {
        'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
        'md5': '544a9468546059d4e80d76265b0443b8',
        'info_dict': {
            'id': 'PracticalUnix_intro-environment',
            'ext': 'mp4',
            'title': 'Intro Environment',
        }
    }

    # Common prefix of every URL this extractor builds; links scraped from
    # the site's pages are relative to this folder.
    _MAIN_FOLDER = 'http://openclassroom.stanford.edu/MainFolder/'

    def _real_extract(self, url):
        """Dispatch on the ``course`` / ``video`` groups matched in *url*."""
        mobj = self._match_valid_url(url)
        course = mobj.group('course')
        video = mobj.group('video')

        if course and video:  # A specific video
            return self._extract_video(course, video)
        if course:  # A course page
            return self._extract_course(url, course)
        return self._extract_root()  # Root page

    def _extract_video(self, course, video):
        """Build the info dict for one video from its metadata XML file.

        Raises ExtractorError when the XML has no ``<title>`` element, or
        when its ``<videoFile>`` element is missing or empty.
        """
        video_id = course + '_' + video
        base_url = self._MAIN_FOLDER + 'courses/' + course + '/videos/'
        mdoc = self._download_xml(base_url + video + '.xml', video_id)

        title_el = mdoc.find('./title')
        file_el = mdoc.find('./videoFile')
        # An empty <videoFile/> has text None; concatenating it below would
        # raise TypeError, so treat it as invalid metadata just like a
        # missing element.
        if title_el is None or file_el is None or not file_el.text:
            raise ExtractorError('Invalid metadata XML file')

        return {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
            'title': title_el.text,
            'url': base_url + file_el.text,
        }

    def _extract_course(self, url, course):
        """Build a playlist of the video pages linked from a course page."""
        coursepage = self._download_webpage(
            url, course,
            note='Downloading course info page',
            errnote='Unable to download course info page')

        return {
            'id': course,
            '_type': 'playlist',
            'uploader': None,
            'upload_date': None,
            # Fall back to the course id when the page has no <h1> heading.
            'title': self._html_search_regex(
                r'<h1>([^<]+)</h1>', coursepage, 'title', default=course),
            # The description is optional (fatal=False).
            'description': self._html_search_regex(
                r'(?s)<description>([^<]+)</description>',
                coursepage, 'description', fatal=False),
            'entries': self._link_entries(coursepage, 'VideoPage'),
        }

    def _extract_root(self):
        """Build a playlist of the course pages linked from the home page."""
        playlist_id = 'Stanford OpenClassroom'
        rootpage = self._download_webpage(
            self._MAIN_FOLDER + 'HomePage.php', playlist_id,
            errnote='Unable to download course info page')

        return {
            'id': playlist_id,
            '_type': 'playlist',
            'uploader': None,
            'upload_date': None,
            'title': playlist_id,
            'entries': self._link_entries(rootpage, 'CoursePage'),
        }

    def _link_entries(self, webpage, page_name):
        """Return url_result entries for each ``<page_name>.php?...`` link.

        *page_name* is the PHP page to look for ('VideoPage' or
        'CoursePage'). Matched hrefs are passed through orderedSet
        (presumably de-duplicating while keeping first-seen order; see
        ..utils) and HTML-unescaped before being resolved against
        ``_MAIN_FOLDER``.
        """
        hrefs = orderedSet(re.findall(
            r'<a href="(%s\.php\?[^"]+)">' % page_name, webpage))
        return [
            self.url_result(self._MAIN_FOLDER + unescapeHTML(href))
            for href in hrefs]