]>
Commit | Line | Data |
---|---|---|
659e93fc S |
1 | from __future__ import unicode_literals |
2 | ||
3 | import json | |
4 | import random | |
5 | import re | |
6 | ||
7 | from .common import InfoExtractor | |
8 | from ..compat import ( | |
9 | compat_b64decode, | |
10 | compat_HTTPError, | |
29f7c58a | 11 | compat_str, |
659e93fc S |
12 | ) |
13 | from ..utils import ( | |
29f7c58a | 14 | clean_html, |
659e93fc | 15 | ExtractorError, |
29f7c58a | 16 | js_to_json, |
17 | parse_duration, | |
18 | try_get, | |
19 | unified_timestamp, | |
659e93fc S |
20 | urlencode_postdata, |
21 | urljoin, | |
22 | ) | |
23 | ||
24 | ||
class LinuxAcademyIE(InfoExtractor):
    """Extractor for linuxacademy.com lessons and course modules.

    A ``courses/lesson/course/<id>/lesson/<id>`` URL yields a single HLS
    video. A ``modules/view/id/<id>`` URL yields a playlist whose entries
    are lesson URLs handed back to this same extractor.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?linuxacademy\.com/cp/
                        (?:
                            courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
                            modules/view/id/(?P<course_id>\d+)
                        )
                    '''
    _TESTS = [{
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
        'info_dict': {
            'id': '7971-2',
            'ext': 'mp4',
            'title': 'What Is Data Science',
            'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
            'timestamp': int,  # The timestamp and upload date changes
            'upload_date': r're:\d+',
            'duration': 304,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
        'only_matching': True,
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/154',
        'info_dict': {
            'id': '154',
            'title': 'AWS Certified Cloud Practitioner',
            'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
            'duration': 28835,
        },
        'playlist_count': 41,
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/39',
        'info_dict': {
            'id': '39',
            'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep (legacy)',
            'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
            'duration': 89280,
        },
        'playlist_count': 73,
        'skip': 'Requires Linux Academy account credentials',
    }]

    _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
    _ORIGIN_URL = 'https://linuxacademy.com'
    _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
    _NETRC_MACHINE = 'linuxacademy'

    def _real_initialize(self):
        """Log in before any extraction is attempted."""
        self._login()

    def _login(self):
        """Authenticate against login.linuxacademy.com.

        Returns immediately when no username is available. Otherwise:

        1. fetch the authorize page and decode the base64 blob passed to
           ``atob(...)`` to obtain the ``extraParams`` login payload;
        2. POST the credentials as JSON to ``/usernamepassword/login``;
        3. POST the hidden form inputs of that response to
           ``/login/callback``;
        4. take the access token from the final URL, or failing that from
           the ``authorizationResponse`` object in the callback page;
        5. request the ``tokenValidateLogin`` page with that token.

        Raises:
            ExtractorError: with ``expected=True`` and the server supplied
                message when the login request is answered with HTTP 401
                and a readable error body; any other download error is
                re-raised unchanged.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        def random_string():
            # NOTE(review): the alphabet has no 'W'; presumably this mirrors
            # the site's own web client - confirm before "fixing" it.
            return ''.join([
                random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
                for _ in range(32)])

        webpage, urlh = self._download_webpage_handle(
            self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
                'client_id': self._CLIENT_ID,
                'response_type': 'token id_token',
                'response_mode': 'web_message',
                'redirect_uri': self._ORIGIN_URL,
                'scope': 'openid email user_impersonation profile',
                'audience': self._ORIGIN_URL,
                'state': random_string(),
                'nonce': random_string(),
            })

        # The authorize page embeds its configuration as a base64 encoded
        # JSON string inside an atob("...") call.
        login_data = self._parse_json(
            self._search_regex(
                r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
                'login info', group='value'), None,
            transform_source=lambda x: compat_b64decode(x).decode('utf-8')
        )['extraParams']

        login_data.update({
            'client_id': self._CLIENT_ID,
            'redirect_uri': self._ORIGIN_URL,
            'tenant': 'lacausers',
            'connection': 'Username-Password-ACG-Proxy',
            'username': username,
            'password': password,
            'sso': 'true',
        })

        # The URL the authorize request ended up on is sent as the Referer
        # of the two following requests.
        login_state_url = urlh.geturl()

        try:
            login_page = self._download_webpage(
                'https://login.linuxacademy.com/usernamepassword/login', None,
                'Downloading login page', data=json.dumps(login_data).encode(),
                headers={
                    'Content-Type': 'application/json',
                    'Origin': 'https://login.linuxacademy.com',
                    'Referer': login_state_url,
                })
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                # The error body is read as bytes; decode it explicitly so
                # parsing also works on interpreters whose json module only
                # accepts text. Parsing is non-fatal so that an unreadable
                # body does not hide the original 401.
                error = self._parse_json(
                    e.cause.read().decode('utf-8', 'replace'), None,
                    fatal=False)
                if isinstance(error, dict):
                    message = error.get('description') or error.get('code')
                    if message:
                        raise ExtractorError(
                            '%s said: %s' % (self.IE_NAME, message), expected=True)
            raise

        callback_page, urlh = self._download_webpage_handle(
            'https://login.linuxacademy.com/login/callback', None,
            'Downloading callback page',
            data=urlencode_postdata(self._hidden_inputs(login_page)),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': 'https://login.linuxacademy.com',
                'Referer': login_state_url,
            })

        # Preferred source: the token in the URL we were redirected to.
        access_token = self._search_regex(
            r'access_token=([^=&]+)', urlh.geturl(),
            'access token', default=None)
        if not access_token:
            # Fallback: the token embedded in the callback page's script.
            access_token = self._parse_json(
                self._search_regex(
                    r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page,
                    'authorization response'), None,
                transform_source=js_to_json)['response']['access_token']

        self._download_webpage(
            'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
            % access_token, None, 'Downloading token validation page')

    def _real_extract(self, url):
        """Return a playlist dict for a course URL or a video dict for a lesson URL.

        The item id is the course id for course URLs and
        ``<chapter_id>-<lesson_id>`` for lesson URLs.
        """
        mobj = re.match(self._VALID_URL, url)
        chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
        item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)

        webpage = self._download_webpage(url, item_id)

        # course path
        if course_id:
            module = self._parse_json(
                self._search_regex(
                    r'window\.module\s*=\s*({(?:(?!};)[^"]|"([^"]|\\")*")+})\s*;', webpage, 'module'),
                item_id)
            entries = []
            # Chapter state carried from the most recent "section" item to
            # the lessons that follow it.
            chapter_number = None
            chapter = None
            chapter_id = None
            for item in module['items']:
                if not isinstance(item, dict):
                    continue

                def type_field(key):
                    return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower()
                type_fields = (type_field('name'), type_field('slug'))
                # Move to next module section
                if 'section' in type_fields:
                    chapter = item.get('course_name')
                    chapter_id = item.get('course_module')
                    chapter_number = 1 if not chapter_number else chapter_number + 1
                    continue
                # Skip non-lessons
                if 'lesson' not in type_fields:
                    continue
                lesson_url = urljoin(url, item.get('url'))
                if not lesson_url:
                    continue
                title = item.get('title') or item.get('lesson_name')
                description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
                entries.append({
                    '_type': 'url_transparent',
                    'url': lesson_url,
                    'ie_key': LinuxAcademyIE.ie_key(),
                    'title': title,
                    'description': description,
                    'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
                    'duration': parse_duration(item.get('duration')),
                    'chapter': chapter,
                    'chapter_id': chapter_id,
                    'chapter_number': chapter_number,
                })
            return {
                '_type': 'playlist',
                'entries': entries,
                'id': course_id,
                'title': module.get('title'),
                'description': module.get('md_desc') or clean_html(module.get('desc')),
                'duration': parse_duration(module.get('duration')),
            }

        # single video path
        # Only the first entry of the player playlist is used.
        m3u8_url = self._parse_json(
            self._search_regex(
                r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'),
            item_id)[0]['file']
        formats = self._extract_m3u8_formats(
            m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
            m3u8_id='hls')
        self._sort_formats(formats)
        info = {
            'id': item_id,
            'formats': formats,
        }
        # Lesson metadata is optional: a missing or unparsable object just
        # leaves the metadata fields unset.
        lesson = self._parse_json(
            self._search_regex(
                (r'window\.lesson\s*=\s*({.+?})\s*;',
                 r'player\.lesson\s*=\s*({.+?})\s*;'),
                webpage, 'lesson', default='{}'), item_id, fatal=False)
        if lesson:
            info.update({
                'title': lesson.get('lesson_name'),
                'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
                'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
                'duration': parse_duration(lesson.get('duration')),
            })
        # The title is mandatory, so fall back to scraping it from the page.
        if not info.get('title'):
            info['title'] = self._search_regex(
                (r'>Lecture\s*:\s*(?P<value>[^<]+)',
                 r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
                'title', group='value')
        return info