]>
Commit | Line | Data |
---|---|---|
291a168b PH |
1 | import re |
2 | import socket | |
3 | import xml.etree.ElementTree | |
4 | ||
5 | from .common import InfoExtractor | |
6 | from ..utils import ( | |
7 | compat_http_client, | |
8 | compat_str, | |
9 | compat_urllib_error, | |
10 | compat_urllib_request, | |
11 | ||
12 | ExtractorError, | |
13 | orderedSet, | |
14 | unescapeHTML, | |
15 | ) | |
16 | ||
17 | ||
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford OpenClassroom.

    Handles three kinds of URL, distinguished by the named groups of
    ``_VALID_URL``:

    * a video page (``course`` and ``video`` present): one video is extracted;
    * a course page (only ``course`` present): every linked video page is
      extracted in turn;
    * the root/home page (neither present): every linked course page is
      extracted in turn.

    In all cases ``_real_extract`` returns a flat list of video info dicts.
    """
    IE_NAME = u'stanfordoc'
    IE_DESC = u'Stanford Open ClassRoom'
    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    _TEST = {
        u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
        u'file': u'PracticalUnix_intro-environment.mp4',
        u'md5': u'544a9468546059d4e80d76265b0443b8',
        u'info_dict': {
            u"title": u"Intro Environment"
        }
    }

    # Common prefix of every page and media URL built by this extractor.
    _MAIN_FOLDER_URL = 'http://openclassroom.stanford.edu/MainFolder/'

    def _real_extract(self, url):
        """Dispatch *url* to the video, course or root page handler.

        Returns a list of video info dicts.
        Raises ExtractorError if *url* does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        course = mobj.group('course')
        video = mobj.group('video')
        if course and video:  # A specific video
            return self._extract_video(course, video)
        if course:  # A course page
            return self._extract_course(url, course)
        return self._extract_root()  # Root page

    def _extract_video(self, course, video):
        """Return a one-element list describing a single video.

        The title and media file name are read from the per-video metadata
        XML located next to the media files.
        Raises ExtractorError if the XML cannot be downloaded or lacks a
        <title> element or a non-empty <videoFile> element.
        """
        video_id = course + '_' + video
        self.report_extraction(video_id)

        base_url = self._MAIN_FOLDER_URL + 'courses/' + course + '/videos/'
        xml_url = base_url + video + '.xml'
        try:
            response = compat_urllib_request.urlopen(xml_url)
            try:
                meta_xml = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(meta_xml)
        title_node = mdoc.find('./title')
        file_node = mdoc.find('./videoFile')
        # Compare against None explicitly: an Element without children is
        # falsy.  An empty <videoFile/> has text None and cannot yield a URL.
        if title_node is None or file_node is None or not file_node.text:
            raise ExtractorError(u'Invalid metadata XML file')

        video_url = base_url + file_node.text
        return [{
            'id': video_id,
            'uploader': None,
            'upload_date': None,
            'title': title_node.text,
            'url': video_url,
            # Everything after the last dot of the media URL.
            'ext': video_url.rpartition('.')[2],
        }]

    def _extract_course(self, url, course):
        """Return the videos of every video page linked from a course page."""
        coursepage = self._download_webpage(
            url, course,
            note='Downloading course info page',
            errnote='Unable to download course info page')
        return self._extract_linked_pages(
            r'<a href="(VideoPage\.php\?[^"]+)">', coursepage)

    def _extract_root(self):
        """Return the videos of every course linked from the home page."""
        # Use the same download helper as the course page so that the page
        # is handled the same way before being searched with a text pattern.
        # NOTE(review): assumes _download_webpage returns decoded text - confirm.
        rootpage = self._download_webpage(
            self._MAIN_FOLDER_URL + 'HomePage.php', 'Stanford OpenClassroom',
            errnote='Unable to download course info page')
        return self._extract_linked_pages(
            r'<a href="(CoursePage\.php\?[^"]+)">', rootpage)

    def _extract_linked_pages(self, link_pattern, webpage):
        """Extract every distinct page matched by *link_pattern* in *webpage*.

        *link_pattern* must capture the link target relative to the
        MainFolder directory.  Links are de-duplicated with orderedSet, and
        the results of extracting each page are concatenated into one list.
        """
        results = []
        for link in orderedSet(re.findall(link_pattern, webpage)):
            # Link targets come from HTML attributes, so entities such as
            # &amp; must be decoded before the URL is requested.
            results += self.extract(self._MAIN_FOLDER_URL + unescapeHTML(link))
        return results