]>
Commit | Line | Data |
---|---|---|
1 | import collections | |
2 | import json | |
3 | import os | |
4 | import random | |
5 | import re | |
6 | ||
7 | from .common import InfoExtractor | |
8 | from ..compat import ( | |
9 | compat_str, | |
10 | compat_urlparse, | |
11 | ) | |
12 | from ..utils import ( | |
13 | dict_get, | |
14 | ExtractorError, | |
15 | float_or_none, | |
16 | int_or_none, | |
17 | parse_duration, | |
18 | parse_qs, | |
19 | qualities, | |
20 | srt_subtitles_timecode, | |
21 | try_get, | |
22 | update_url_query, | |
23 | urlencode_postdata, | |
24 | ) | |
25 | ||
26 | ||
class PluralsightBaseIE(InfoExtractor):
    """Shared helpers for the Pluralsight extractors.

    Holds the API base URL, the GraphQL endpoint and headers and the course
    bootstrap query, and implements downloading of course metadata with a
    fallback to the older JSON payload endpoint.
    """

    _API_BASE = 'https://app.pluralsight.com'

    _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE
    _GRAPHQL_HEADERS = {
        'Content-Type': 'application/json;charset=UTF-8',
    }
    # GraphQL query used to bootstrap the player; the single %s placeholder
    # receives the course id.
    _GRAPHQL_COURSE_TMPL = '''
query BootstrapPlayer {
rpc {
bootstrapPlayer {
profile {
firstName
lastName

username
userHandle
authed
isAuthed
plan
}
course(courseId: "%s") {
name
title
courseHasCaptions
translationLanguages {
code
name
}
supportsWideScreenVideoFormats
timestamp
modules {
name
title
duration
formattedDuration
author
authorized
clips {
authorized
clipId
duration
formattedDuration
id
index
moduleIndex
moduleTitle
name
title
watched
}
}
}
}
}
}'''

    def _download_course(self, course_id, url, display_id):
        """Return the course metadata dict for ``course_id``.

        The GraphQL endpoint is tried first; when it fails with an
        ExtractorError the old payload endpoint is queried instead, using
        ``url`` as the Referer header.
        """
        try:
            return self._download_course_rpc(course_id, url, display_id)
        except ExtractorError:
            # Old API fallback
            return self._download_json(
                '%s/player/user/api/v1/player/payload' % self._API_BASE,
                display_id, data=urlencode_postdata({'courseId': course_id}),
                headers={'Referer': url})

    def _download_course_rpc(self, course_id, url, display_id):
        """Fetch the course dict through the GraphQL endpoint.

        Raises ExtractorError when the response does not contain a non-empty
        course dict.  ``url`` is accepted for signature parity with
        ``_download_course`` and is not used here.
        """
        response = self._download_json(
            self._GRAPHQL_EP, display_id, data=json.dumps({
                'query': self._GRAPHQL_COURSE_TMPL % course_id,
                'variables': {}
            }).encode('utf-8'), headers=self._GRAPHQL_HEADERS)

        course = try_get(
            response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'],
            dict)
        if course:
            return course

        # Look the message up defensively: a response without an
        # error/message pair must still surface as ExtractorError so that
        # _download_course can fall back to the old API instead of crashing
        # with KeyError/TypeError.
        message = try_get(response, lambda x: x['error']['message'])
        if message:
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, message),
                expected=True)
        raise ExtractorError('Unable to retrieve course data via GraphQL')
110 | ||
111 | ||
class PluralsightIE(PluralsightBaseIE):
    """Extractor for a single Pluralsight clip addressed by a player URL.

    The clip is identified by the ``author``, ``name`` (module name),
    ``clip`` (clip index) and ``course`` query parameters of the URL.
    """

    IE_NAME = 'pluralsight'
    _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?'
    _LOGIN_URL = 'https://app.pluralsight.com/id/'

    _NETRC_MACHINE = 'pluralsight'

    _TESTS = [{
        'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas',
        'md5': '4d458cf5cf4c593788672419a8dd4cf8',
        'info_dict': {
            'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04',
            'ext': 'mp4',
            'title': 'Demo Monitoring',
            'duration': 338,
        },
        'skip': 'Requires pluralsight account credentials',
    }, {
        'url': 'https://app.pluralsight.com/training/player?course=angularjs-get-started&author=scott-allen&name=angularjs-get-started-m1-introduction&clip=0&mode=live',
        'only_matching': True,
    }, {
        # available without pluralsight account
        'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started',
        'only_matching': True,
    }, {
        'url': 'https://app.pluralsight.com/player?course=ccna-intro-networking&author=ross-bagurdes&name=ccna-intro-networking-m06&clip=0',
        'only_matching': True,
    }]

    # GraphQL query for a clip's media URLs; filled by %-formatting with a
    # mapping (see clip_post in _real_extract).  clipIndex must be an int.
    GRAPHQL_VIEWCLIP_TMPL = '''
query viewClip {
viewClip(input: {
author: "%(author)s",
clipIndex: %(clipIndex)d,
courseName: "%(courseName)s",
includeCaptions: %(includeCaptions)s,
locale: "%(locale)s",
mediaType: "%(mediaType)s",
moduleName: "%(moduleName)s",
quality: "%(quality)s"
}) {
urls {
url
cdn
rank
source
},
status
}
}'''

    def _perform_login(self, username, password):
        """Log in with the given credentials.

        Raises ExtractorError when the site reports a validation error, when
        the account is blocked, when pending agreements must be accepted, or
        when none of the known logged-in markers is found in the response.
        """
        login_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')

        login_form = self._hidden_inputs(login_page)

        login_form.update({
            'Username': username,
            'Password': password,
        })

        post_url = self._search_regex(
            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
            'post url', default=self._LOGIN_URL, group='url')

        # The form action may be relative to the login page
        if not post_url.startswith('http'):
            post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)

        response = self._download_webpage(
            post_url, None, 'Logging in',
            data=urlencode_postdata(login_form),
            headers={'Content-Type': 'application/x-www-form-urlencoded'})

        error = self._search_regex(
            r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>',
            response, 'error message', default=None)
        if error:
            raise ExtractorError('Unable to login: %s' % error, expected=True)

        # Any one of these markers is taken as proof of a successful login
        logged_in_markers = (
            r'__INITIAL_STATE__', r'["\']currentUser["\']',
            # new layout?
            r'>\s*Sign out\s*<')
        if any(re.search(p, response) for p in logged_in_markers):
            return

        BLOCKED = 'Your account has been blocked due to suspicious activity'
        if BLOCKED in response:
            raise ExtractorError(
                'Unable to login: %s' % BLOCKED, expected=True)
        MUST_AGREE = 'To continue using Pluralsight, you must agree to'
        if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')):
            raise ExtractorError(
                'Unable to login: %s some documents. Go to pluralsight.com, '
                'log in and agree with what Pluralsight requires.'
                % MUST_AGREE, expected=True)

        raise ExtractorError('Unable to log in')

    def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_id):
        """Download captions and return them as JSON and SRT subtitle entries.

        The transcript endpoint keyed by ``clip_id`` is tried first (when a
        clip id is available); if it yields nothing, the retrieve-captions
        endpoint is queried by author, clip index and module name.  Both
        requests are non-fatal.  Returns ``{lang: [...]}`` or None when no
        captions could be obtained.
        """
        captions = None
        if clip_id:
            captions = self._download_json(
                '%s/transcript/api/v1/caption/json/%s/%s'
                % (self._API_BASE, clip_id, lang), video_id,
                'Downloading captions JSON', 'Unable to download captions JSON',
                fatal=False)
        if not captions:
            captions_post = {
                'a': author,
                'cn': int(clip_idx),
                'lc': lang,
                'm': name,
            }
            captions = self._download_json(
                '%s/player/retrieve-captions' % self._API_BASE, video_id,
                'Downloading captions JSON', 'Unable to download captions JSON',
                fatal=False, data=json.dumps(captions_post).encode('utf-8'),
                headers={'Content-Type': 'application/json;charset=utf-8'})
        if captions:
            return {
                lang: [{
                    'ext': 'json',
                    'data': json.dumps(captions),
                }, {
                    'ext': 'srt',
                    'data': self._convert_subtitles(duration, captions),
                }]
            }

    @staticmethod
    def _convert_subtitles(duration, subs):
        """Convert a list of caption dicts to SRT text.

        Each cue starts at its own time offset and ends at the offset of the
        following entry; the last cue ends at ``duration``.  Entries lacking
        a start offset, a text or a resolvable end time are skipped.  Cues
        are numbered with their index in ``subs``.
        """
        TIME_OFFSET_KEYS = ('displayTimeOffset', 'DisplayTimeOffset')
        TEXT_KEYS = ('text', 'Text')
        last_index = len(subs) - 1
        cues = []
        for num, current in enumerate(subs):
            # skip_false_values=False keeps a legitimate offset of 0
            start = float_or_none(
                dict_get(current, TIME_OFFSET_KEYS, skip_false_values=False))
            text = dict_get(current, TEXT_KEYS)
            if start is None or text is None:
                continue
            if num == last_index:
                end = duration
            else:
                end = float_or_none(dict_get(
                    subs[num + 1], TIME_OFFSET_KEYS, skip_false_values=False))
            if end is None:
                continue
            cues.append(os.linesep.join(
                (
                    '%d' % num,
                    '%s --> %s' % (
                        srt_subtitles_timecode(start),
                        srt_subtitles_timecode(end)),
                    text,
                    # trailing separator yields the blank line between cues
                    os.linesep,
                )))
        return ''.join(cues)

    @staticmethod
    def _find_clip(modules, module_name, clip_idx):
        """Return the first clip of module ``module_name`` whose index equals
        ``clip_idx`` (compared as strings), or None when there is none."""
        for module_ in modules:
            if module_name not in (module_.get('moduleName'), module_.get('name')):
                continue
            # 'clips' may be present but null, hence `or []`
            for clip_ in module_.get('clips') or []:
                clip_index = clip_.get('clipIndex')
                if clip_index is None:
                    clip_index = clip_.get('index')
                if clip_index is None:
                    continue
                if compat_str(clip_index) == clip_idx:
                    return clip_
        return None

    def _download_viewclip(self, clip_post, format_id, display_id):
        """Return the viewClip data for one format, or a falsy value.

        The GraphQL endpoint is queried first.  If that request fails or its
        response has no ``data``/``viewClip`` entry, the legacy viewclip
        endpoint is queried non-fatally instead.
        """
        try:
            response = self._download_json(
                self._GRAPHQL_EP, display_id,
                'Downloading %s viewclip graphql' % format_id,
                data=json.dumps({
                    'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post,
                    'variables': {}
                }).encode('utf-8'),
                headers=self._GRAPHQL_HEADERS)
        except ExtractorError:
            response = None

        data = response.get('data') if isinstance(response, dict) else None
        if isinstance(data, dict) and 'viewClip' in data:
            return data['viewClip']

        # Still works but most likely will go soon
        return self._download_json(
            '%s/video/clips/viewclip' % self._API_BASE, display_id,
            'Downloading %s viewclip JSON' % format_id, fatal=False,
            data=json.dumps(clip_post).encode('utf-8'),
            headers={'Content-Type': 'application/json;charset=utf-8'})

    def _real_extract(self, url):
        """Extract title, duration, formats and subtitles of a single clip.

        Raises ExtractorError when a required query parameter is missing or
        when the clip cannot be found in the course data.
        """
        qs = parse_qs(url)

        author = qs.get('author', [None])[0]
        name = qs.get('name', [None])[0]
        clip_idx = qs.get('clip', [None])[0]
        course_name = qs.get('course', [None])[0]

        if any(not f for f in (author, name, clip_idx, course_name,)):
            raise ExtractorError('Invalid URL', expected=True)

        display_id = '%s-%s' % (name, clip_idx)

        course = self._download_course(course_name, url, display_id)

        clip = self._find_clip(course['modules'], name, clip_idx)
        if not clip:
            raise ExtractorError('Unable to resolve clip')

        title = clip['title']
        clip_id = clip.get('clipName') or clip.get('name') or clip['clipId']

        QUALITIES = {
            'low': {'width': 640, 'height': 480},
            'medium': {'width': 848, 'height': 640},
            'high': {'width': 1024, 'height': 768},
            'high-widescreen': {'width': 1280, 'height': 720},
        }

        QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen',)
        quality_key = qualities(QUALITIES_PREFERENCE)

        AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities'])

        # Built per call, so appending to the lists below is not shared state
        ALLOWED_QUALITIES = (
            AllowedQuality('webm', ['high', ]),
            AllowedQuality('mp4', ['low', 'medium', 'high', ]),
        )

        # Some courses also offer widescreen resolution for high quality (see
        # https://github.com/ytdl-org/youtube-dl/issues/7766)
        widescreen = course.get('supportsWideScreenVideoFormats') is True
        best_quality = 'high-widescreen' if widescreen else 'high'
        if widescreen:
            for allowed_quality in ALLOWED_QUALITIES:
                allowed_quality.qualities.append(best_quality)

        # In order to minimize the number of calls to ViewClip API and reduce
        # the probability of being throttled or banned by Pluralsight we will request
        # only single format until formats listing was explicitly requested.
        if self.get_param('listformats', False):
            allowed_qualities = ALLOWED_QUALITIES
        else:
            def guess_allowed_qualities():
                req_format = self.get_param('format') or 'best'
                req_format_split = req_format.split('-', 1)
                if len(req_format_split) > 1:
                    req_ext, req_quality = req_format_split
                    quality_parts = req_quality.split('-')
                    # Format ids look like <ext>-<quality>[-<cdn>] and the
                    # quality itself may contain one dash (high-widescreen),
                    # so try the two-part name first and then the one-part
                    # name (which also covers a trailing CDN suffix).
                    candidates = ('-'.join(quality_parts[:2]), quality_parts[0])
                    for allowed_quality in ALLOWED_QUALITIES:
                        if req_ext != allowed_quality.ext:
                            continue
                        for candidate in candidates:
                            if candidate in allowed_quality.qualities:
                                return (AllowedQuality(req_ext, (candidate, )), )
                req_ext = 'webm' if self.get_param('prefer_free_formats') else 'mp4'
                return (AllowedQuality(req_ext, (best_quality, )), )
            allowed_qualities = guess_allowed_qualities()

        formats = []
        for ext, qualities_ in allowed_qualities:
            for quality in qualities_:
                f = QUALITIES[quality].copy()
                clip_post = {
                    'author': author,
                    'includeCaptions': 'false',
                    'clipIndex': int(clip_idx),
                    'courseName': course_name,
                    'locale': 'en',
                    'moduleName': name,
                    'mediaType': ext,
                    'quality': '%dx%d' % (f['width'], f['height']),
                }
                format_id = '%s-%s' % (ext, quality)

                viewclip = self._download_viewclip(clip_post, format_id, display_id)

                # Pluralsight tracks multiple sequential calls to ViewClip API and start
                # to return 429 HTTP errors after some time (see
                # https://github.com/ytdl-org/youtube-dl/pull/6989). Moreover it may even lead
                # to account ban (see https://github.com/ytdl-org/youtube-dl/issues/6842).
                # To somewhat reduce the probability of these consequences
                # we sleep a random amount of time between calls to ViewClip.
                self._sleep(
                    random.randint(5, 10), display_id,
                    '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling')

                # Covers both a failed (falsy) download and an unexpected shape
                if not isinstance(viewclip, dict):
                    continue

                clip_urls = viewclip.get('urls')
                if not isinstance(clip_urls, list):
                    continue

                for clip_url_data in clip_urls:
                    if not isinstance(clip_url_data, dict):
                        continue
                    clip_url = clip_url_data.get('url')
                    if not clip_url:
                        continue
                    cdn = clip_url_data.get('cdn')
                    clip_f = f.copy()
                    clip_f.update({
                        'url': clip_url,
                        'ext': ext,
                        'format_id': '%s-%s' % (format_id, cdn) if cdn else format_id,
                        'quality': quality_key(quality),
                        'source_preference': int_or_none(clip_url_data.get('rank')),
                    })
                    formats.append(clip_f)

        duration = int_or_none(
            clip.get('duration')) or parse_duration(clip.get('formattedDuration'))

        # TODO: other languages?
        subtitles = self.extract_subtitles(
            author, clip_idx, clip.get('clipId'), 'en', name, duration, display_id)

        return {
            'id': clip_id,
            'title': title,
            'duration': duration,
            'creator': author,
            'formats': formats,
            'subtitles': subtitles,
        }
428 | ||
429 | ||
class PluralsightCourseIE(PluralsightBaseIE):
    """Extractor for a whole Pluralsight course.

    Produces a playlist whose entries are player URLs handled by
    PluralsightIE, one per clip of every module.
    """

    IE_NAME = 'pluralsight:course'
    _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:library/)?courses/(?P<id>[^/]+)'
    _TESTS = [{
        # Free course from Pluralsight Starter Subscription for Microsoft TechNet
        # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz
        'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas',
        'info_dict': {
            'id': 'hosting-sql-server-windows-azure-iaas',
            'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals',
            'description': 'md5:61b37e60f21c4b2f91dc621a977d0986',
        },
        'playlist_count': 31,
    }, {
        # available without pluralsight account
        'url': 'https://www.pluralsight.com/courses/angularjs-get-started',
        'only_matching': True,
    }, {
        'url': 'https://app.pluralsight.com/library/courses/understanding-microsoft-azure-amazon-aws/table-of-contents',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build a playlist of clip player URLs for the course in ``url``.

        Modules lacking an author or a name, and clips lacking an integer
        index, are skipped.
        """
        course_id = self._match_id(url)

        # TODO: PSM cookie

        course = self._download_course(course_id, url, course_id)

        title = course['title']
        course_name = course['name']
        course_data = course['modules']
        description = course.get('description') or course.get('shortDescription')

        entries = []
        # chapter numbers are 1-based
        for num, module in enumerate(course_data, 1):
            author = module.get('author')
            module_name = module.get('name')
            if not author or not module_name:
                continue
            # 'clips' may be present but null; `or []` avoids iterating None
            for clip in module.get('clips') or []:
                clip_index = int_or_none(clip.get('index'))
                if clip_index is None:
                    continue
                clip_url = update_url_query(
                    '%s/player' % self._API_BASE, query={
                        'mode': 'live',
                        'course': course_name,
                        'author': author,
                        'name': module_name,
                        'clip': clip_index,
                    })
                entries.append({
                    '_type': 'url_transparent',
                    'url': clip_url,
                    'ie_key': PluralsightIE.ie_key(),
                    'chapter': module.get('title'),
                    'chapter_number': num,
                    'chapter_id': module.get('moduleRef'),
                })

        return self.playlist_result(entries, course_id, title, description)