]>
Commit | Line | Data |
---|---|---|
32d687f5 | 1 | # encoding: utf-8 |
2 | from __future__ import unicode_literals | |
3 | ||
4 | import re | |
5 | import json | |
6 | ||
7 | from .common import InfoExtractor | |
8 | from .brightcove import BrightcoveIE | |
9 | ||
10 | from ..compat import ( | |
11 | compat_urllib_parse, | |
12 | compat_urllib_request, | |
13 | ) | |
14 | from ..utils import ( | |
15 | ExtractorError, | |
16 | smuggle_url, | |
17 | std_headers, | |
18 | ) | |
19 | ||
20 | ||
class SafariBaseIE(InfoExtractor):
    """Common base for the safaribooksonline.com extractors.

    Provides the login step. ``_real_initialize`` runs it at most once and
    records success on the class itself, so every subclass instance created
    afterwards skips it.
    """

    _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/'
    # Markup searched for in the response to the login POST; if it is absent
    # the login is treated as failed.
    _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]+>Sign Out</a>'
    _ACCOUNT_CREDENTIALS_HINT = ('Use --username and --password options to '
                                 'supply credentials for safaribooksonline.com ')
    _NETRC_MACHINE = 'safaribooksonline'

    # Class-level flag, always read and written through SafariBaseIE so that
    # it is shared by all subclasses.
    LOGGED_IN = False

    def _real_initialize(self):
        """Log in unless a previous extractor instance already did."""
        # We only need to log in once for courses or individual videos
        if not SafariBaseIE.LOGGED_IN:
            self._login()
            SafariBaseIE.LOGGED_IN = True

    def _login(self):
        """Fetch the login form, submit the credentials and verify the result.

        Raises:
            ExtractorError: (``expected=True``) when no username is
                available, or when the response to the login POST does not
                match ``_SUCCESSFUL_LOGIN_REGEX``.
        """
        username, password = self._get_login_info()
        if username is None:
            raise ExtractorError(
                self._ACCOUNT_CREDENTIALS_HINT,
                expected=True)

        # Work on a copy: std_headers is a shared module-level mapping, and
        # adding the Referer to it directly would leak this site's login URL
        # into the default headers of every later request.
        headers = dict(std_headers)
        if 'Referer' not in headers:
            headers['Referer'] = self._LOGIN_URL

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            'Downloading login form')

        # The form carries a hidden CSRF token that has to be posted back.
        csrf = self._html_search_regex(
            r"<input +type='hidden' +name='csrfmiddlewaretoken' +value='([^']+)' +/>",
            login_page, 'csrf token')

        login_form = {
            'csrfmiddlewaretoken': csrf,
            'email': username,
            'password1': password,
            'login': 'Sign In',
            'next': '',
        }

        # urlencode() yields text; Python 3's urllib refuses str POST data,
        # so hand it bytes. The encoded form is pure ASCII, which makes this
        # a no-op in terms of content.
        post_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
        request = compat_urllib_request.Request(
            self._LOGIN_URL, post_data, headers=headers)
        login_page = self._download_webpage(
            request, None, 'Logging in as %s' % username)

        if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
            raise ExtractorError('Login failed; make sure your credentials are correct and '
                                 'try again.', expected=True)

        self.to_screen('Login successful')
73 | ||
74 | ||
class SafariIE(SafariBaseIE):
    """Extractor for a single ``partNN.html`` page of a Safari title.

    The page is downloaded, the Brightcove URL embedded in it is located,
    and extraction is delegated to the Brightcove extractor.
    """

    IE_NAME = 'safari'
    IE_DESC = 'safaribooksonline.com online video'
    # Both fragments are raw strings so that the regex escapes (\d, \.) are
    # not interpreted as (invalid) string escape sequences.
    _VALID_URL = (r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/'
                  r'(?P<id>\d+)/(?P<part>part\d+)\.html')
    _TEST = {
        'url': ('https://www.safaribooksonline.com/library/view/'
                'hadoop-fundamentals-livelessons/9780133392838/part00.html'),
        'md5': '5b0c4cc1b3c1ba15dda7344085aa5592',
        'info_dict': {
            'id': '9780133392838',
            'ext': 'mp4',
            'title': 'Introduction',
        }
    }

    def _real_extract(self, url):
        """Return a ``url``-type result pointing at the embedded Brightcove video.

        Raises:
            ExtractorError: (``expected=True``) when no Brightcove URL can be
                extracted from the page.
        """
        mobj = re.match(self._VALID_URL, url)
        # The part name (e.g. "part00") is what identifies this download.
        part = mobj.group('part')

        webpage = self._download_webpage(url, part)
        bc_url = BrightcoveIE._extract_brightcove_url(webpage)
        if not bc_url:
            raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True)

        return {
            '_type': 'url',
            # Smuggle the page URL along as Referer for the Brightcove extractor.
            'url': smuggle_url(bc_url, {'Referer': url}),
            'ie_key': 'Brightcove'
        }
105 | ||
106 | ||
class SafariCourseIE(SafariBaseIE):
    """Extractor for a whole Safari course, returned as a playlist.

    The course metadata is fetched from the book API; one playlist entry is
    produced per chapter, each pointing at the matching ``partNN.html`` page
    handled by the 'Safari' extractor.
    """

    IE_NAME = 'safari:course'
    IE_DESC = 'safaribooksonline.com online courses'

    # Both fragments are raw strings so that the regex escape \d is not
    # interpreted as an (invalid) string escape sequence.
    _VALID_URL = (r'https?://(?:www\.)?safaribooksonline\.com/library/view/'
                  r'(?P<course_path>[^/]+)/(?P<id>\d+)/?$')

    _API_BASE = 'https://www.safaribooksonline.com/api/v1/book'
    _API_FORMAT = 'json'

    def _real_extract(self, url):
        """Build a playlist with one entry per chapter of the course.

        Raises:
            ExtractorError: (``expected=True``) when the course JSON has no
                ``chapters`` key.
        """
        mobj = re.match(self._VALID_URL, url)
        course_path = mobj.group('course_path')
        course_id = mobj.group('id')

        webpage = self._download_webpage(
            '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
            course_path, 'Downloading course JSON')

        course_json = json.loads(webpage)

        if 'chapters' not in course_json:
            raise ExtractorError('No chapters found for course %s' % course_id, expected=True)

        # Only the number of chapters is used: part pages are numbered
        # part00, part01, ... in order.
        entries = [
            self.url_result(
                'https://www.safaribooksonline.com/library/view/%s/%s/part%02d.html' % (
                    course_path, course_id, part_number),
                'Safari')
            for part_number in range(len(course_json['chapters']))]

        course_title = course_json['title']

        return self.playlist_result(entries, course_id, course_title)