]>
Commit | Line | Data |
---|---|---|
291a168b | 1 | import re |
291a168b PH |
2 | |
3 | from .common import InfoExtractor | |
4 | from ..utils import ( | |
291a168b PH |
5 | ExtractorError, |
6 | orderedSet, | |
7 | unescapeHTML, | |
8 | ) | |
9 | ||
10 | ||
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for openclassroom.stanford.edu.

    Three kinds of URL are accepted by ``_VALID_URL``:

    * a video page (``course`` and ``video`` query parameters present),
    * a course page (only ``course`` present),
    * the site root / home page (neither present).

    Course and root pages are expanded by following their links and passing
    each linked page back through ``self.extract``; the results are returned
    as one flat list of video info dictionaries.
    """

    IE_NAME = u'stanfordoc'
    IE_DESC = u'Stanford Open ClassRoom'
    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    _TEST = {
        u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
        u'file': u'PracticalUnix_intro-environment.mp4',
        u'md5': u'544a9468546059d4e80d76265b0443b8',
        u'info_dict': {
            u"title": u"Intro Environment"
        }
    }

    # Common prefix of every page and media URL built by this extractor.
    _MAIN_FOLDER = 'http://openclassroom.stanford.edu/MainFolder/'

    def _real_extract(self, url):
        """Dispatch on the URL shape and return a list of info dictionaries.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        course = mobj.group('course')
        video = mobj.group('video')
        if course and video:  # A specific video
            return self._extract_video(course, video)
        if course:  # A course page
            return self._extract_course(url, course)
        return self._extract_root()  # Root page

    def _extract_video(self, course, video):
        """Return a one-element list describing a single video.

        The metadata comes from ``<video>.xml`` in the course's ``videos``
        folder. Raises ExtractorError when that document has no ``title``
        element, no ``videoFile`` element, or an empty ``videoFile``.
        """
        video_id = course + '_' + video
        self.report_extraction(video_id)

        base_url = self._MAIN_FOLDER + 'courses/' + course + '/videos/'
        mdoc = self._download_xml(base_url + video + '.xml', video_id)

        titles = mdoc.findall('./title')
        video_files = mdoc.findall('./videoFile')
        # An element without text has ``.text is None``; concatenating that
        # to the base URL would raise a bare TypeError, so it is reported as
        # invalid metadata just like a missing element.
        if not titles or not video_files or not video_files[0].text:
            raise ExtractorError(u'Invalid metadata XML file')

        video_url = base_url + video_files[0].text
        return [{
            'id': video_id,
            'uploader': None,
            'upload_date': None,
            'title': titles[0].text,
            'url': video_url,
            # Everything after the last dot of the media URL.
            'ext': video_url.rpartition('.')[2],
        }]

    def _extract_course(self, url, course):
        """Return the videos linked from the course page at ``url``."""
        coursepage = self._download_webpage(
            url, course,
            note='Downloading course info page',
            errnote='Unable to download course info page')
        return self._extract_linked_pages(
            r'<a href="(VideoPage\.php\?[^"]+)">', coursepage)

    def _extract_root(self):
        """Return the videos of every course linked from the home page."""
        root_url = self._MAIN_FOLDER + 'HomePage.php'
        rootpage = self._download_webpage(
            root_url, 'Stanford OpenClassroom',
            errnote=u'Unable to download course info page')
        return self._extract_linked_pages(
            r'<a href="(CoursePage\.php\?[^"]+)">', rootpage)

    def _extract_linked_pages(self, link_pattern, webpage):
        """Follow every distinct link in ``webpage`` matching ``link_pattern``.

        ``link_pattern`` must capture the relative page reference in group 1.
        Duplicate links are dropped with ``orderedSet``; each remaining link
        is HTML-unescaped, made absolute and passed to ``self.extract``.
        Returns the concatenation of the per-link results.
        """
        results = []
        for link in orderedSet(re.findall(link_pattern, webpage)):
            results.extend(self.extract(self._MAIN_FOLDER + unescapeHTML(link)))
        return results