]>
Commit | Line | Data |
---|---|---|
291a168b | 1 | import re |
291a168b PH |
2 | |
3 | from .common import InfoExtractor | |
4 | from ..utils import ( | |
291a168b PH |
5 | ExtractorError, |
6 | orderedSet, | |
7 | unescapeHTML, | |
8 | ) | |
9 | ||
10 | ||
class StanfordOpenClassroomIE(InfoExtractor):
    """Extractor for openclassroom.stanford.edu.

    Depending on which query parameters the URL carries, the result is:

    * ``course`` and ``video`` present -> a single video entry,
    * only ``course`` present          -> a playlist of that course's video pages,
    * neither present                  -> a playlist of all course pages linked
      from the site's home page.
    """

    IE_NAME = 'stanfordoc'
    IE_DESC = 'Stanford Open ClassRoom'
    _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    _TEST = {
        'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
        'md5': '544a9468546059d4e80d76265b0443b8',
        'info_dict': {
            'id': 'PracticalUnix_intro-environment',
            'ext': 'mp4',
            'title': 'Intro Environment',
        },
    }

    def _real_extract(self, url):
        """Dispatch on the ``course`` / ``video`` groups of ``_VALID_URL``.

        Returns the info dict built by the matching helper; see the class
        docstring for the three cases.
        """
        mobj = self._match_valid_url(url)
        course = mobj.group('course')
        video = mobj.group('video')

        if course and video:  # A specific video
            return self._extract_video(course, video)
        if course:  # A course page
            return self._extract_course(url, course)
        return self._extract_root()  # Root page

    def _playlist_entries(self, page, link_re):
        """Build url_result entries for every link in *page* matched by *link_re*.

        *link_re* must capture the (HTML-escaped) href relative to
        ``/MainFolder/``. Matches are passed through ``orderedSet`` before
        being turned into entries.
        """
        return [
            self.url_result(
                f'http://openclassroom.stanford.edu/MainFolder/{unescapeHTML(link)}',
            )
            for link in orderedSet(re.findall(link_re, page))
        ]

    def _extract_video(self, course, video):
        """Return the info dict for one video, read from its metadata XML.

        Raises ExtractorError when the XML lacks a ``<title>`` element or a
        non-empty ``<videoFile>`` element.
        """
        info = {
            'id': course + '_' + video,
            'uploader': None,
            'upload_date': None,
        }

        base_url = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
        xml_url = base_url + video + '.xml'
        mdoc = self._download_xml(xml_url, info['id'])

        # Compare against None explicitly: an Element's truthiness reflects
        # its child count, not its presence.
        title_node = mdoc.find('./title')
        file_node = mdoc.find('./videoFile')
        # An empty <videoFile/> has text None; concatenating that onto
        # base_url would raise TypeError rather than a clean extractor error.
        if title_node is None or file_node is None or not file_node.text:
            raise ExtractorError('Invalid metadata XML file')

        info['title'] = title_node.text
        info['url'] = base_url + file_node.text
        return info

    def _extract_course(self, url, course):
        """Return a playlist of the video pages linked from a course page."""
        info = {
            'id': course,
            '_type': 'playlist',
            'uploader': None,
            'upload_date': None,
        }

        coursepage = self._download_webpage(
            url, info['id'],
            note='Downloading course info page',
            errnote='Unable to download course info page')

        # Fall back to the course id when the page has no <h1> heading.
        info['title'] = self._html_search_regex(
            r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

        # The description is optional (fatal=False).
        info['description'] = self._html_search_regex(
            r'(?s)<description>([^<]+)</description>',
            coursepage, 'description', fatal=False)

        info['entries'] = self._playlist_entries(
            coursepage, r'<a href="(VideoPage\.php\?[^"]+)">')
        return info

    def _extract_root(self):
        """Return a playlist of the course pages linked from the home page."""
        info = {
            'id': 'Stanford OpenClassroom',
            '_type': 'playlist',
            'uploader': None,
            'upload_date': None,
        }
        info['title'] = info['id']

        root_url = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
        rootpage = self._download_webpage(
            root_url, info['id'],
            errnote='Unable to download course info page')

        info['entries'] = self._playlist_entries(
            rootpage, r'<a href="(CoursePage\.php\?[^"]+)">')
        return info