]>
Commit | Line | Data |
---|---|---|
8c3e35dd | 1 | import collections |
483fc223 | 2 | import json |
8c3e35dd | 3 | import os |
38eb2968 | 4 | import random |
21c1a00d | 5 | import re |
add96eb9 | 6 | import urllib.parse |
483fc223 S |
7 | |
8 | from .common import InfoExtractor | |
483fc223 S |
9 | from ..utils import ( |
10 | ExtractorError, | |
e897bd82 | 11 | dict_get, |
8c3e35dd | 12 | float_or_none, |
483fc223 S |
13 | int_or_none, |
14 | parse_duration, | |
4dfbf869 | 15 | parse_qs, |
756926ff | 16 | qualities, |
8c3e35dd | 17 | srt_subtitles_timecode, |
93d0583e | 18 | try_get, |
3d7e3aaa | 19 | update_url_query, |
6e6bc8da | 20 | urlencode_postdata, |
483fc223 S |
21 | ) |
22 | ||
23 | ||
class PluralsightBaseIE(InfoExtractor):
    """Base class holding the course-metadata download logic shared by the
    Pluralsight extractors.

    Course metadata is requested from the GraphQL endpoint first; if that
    fails with an ExtractorError the older payload endpoint is tried.
    """

    _API_BASE = 'https://app.pluralsight.com'

    _GRAPHQL_EP = f'{_API_BASE}/player/api/graphql'
    _GRAPHQL_HEADERS = {
        'Content-Type': 'application/json;charset=UTF-8',
    }
    # %-template: the single "%s" placeholder receives the course id.
    _GRAPHQL_COURSE_TMPL = '''
query BootstrapPlayer {
  rpc {
    bootstrapPlayer {
      profile {
        firstName
        lastName

        username
        userHandle
        authed
        isAuthed
        plan
      }
      course(courseId: "%s") {
        name
        title
        courseHasCaptions
        translationLanguages {
          code
          name
        }
        supportsWideScreenVideoFormats
        timestamp
        modules {
          name
          title
          duration
          formattedDuration
          author
          authorized
          clips {
            authorized
            clipId
            duration
            formattedDuration
            id
            index
            moduleIndex
            moduleTitle
            name
            title
            watched
          }
        }
      }
    }
  }
}'''

    def _download_course(self, course_id, url, display_id):
        """Return the course metadata for *course_id*.

        Tries the GraphQL API first and falls back to the old payload API
        when the GraphQL attempt raises ExtractorError.

        @param course_id   course identifier sent to the API
        @param url         page URL, sent as Referer to the old API
        @param display_id  identifier used in progress/error messages
        """
        try:
            return self._download_course_rpc(course_id, url, display_id)
        except ExtractorError:
            # Old API fallback
            return self._download_json(
                f'{self._API_BASE}/player/user/api/v1/player/payload',
                display_id, data=urlencode_postdata({'courseId': course_id}),
                headers={'Referer': url})

    def _download_course_rpc(self, course_id, url, display_id):
        """Return the course dict from the GraphQL bootstrapPlayer query.

        Raises ExtractorError when the response carries no course dict, so
        that _download_course can fall back to the old API.
        """
        response = self._download_json(
            self._GRAPHQL_EP, display_id, data=json.dumps({
                'query': self._GRAPHQL_COURSE_TMPL % course_id,
                'variables': {},
            }).encode(), headers=self._GRAPHQL_HEADERS)

        course = try_get(
            response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'],
            dict)
        if course:
            return course

        # The error payload is looked up defensively: indexing it directly
        # would raise KeyError/TypeError on an unexpected response shape and
        # bypass the ExtractorError-based fallback in _download_course.
        message = try_get(
            response, (
                lambda x: x['error']['message'],
                lambda x: x['errors'][0]['message'],
            ), str)
        if message:
            raise ExtractorError(
                f'{self.IE_NAME} said: {message}', expected=True)
        raise ExtractorError('Unable to download course metadata')
107 | ||
563772ed S |
108 | |
class PluralsightIE(PluralsightBaseIE):
    """Extractor for a single Pluralsight clip addressed by a player URL.

    The clip is identified by the ``author``, ``name`` (module name),
    ``clip`` (clip index) and ``course`` query parameters of the URL.
    """

    IE_NAME = 'pluralsight'
    _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?'
    _LOGIN_URL = 'https://app.pluralsight.com/id/'

    _NETRC_MACHINE = 'pluralsight'

    _TESTS = [{
        'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas',
        'md5': '4d458cf5cf4c593788672419a8dd4cf8',
        'info_dict': {
            'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04',
            'ext': 'mp4',
            'title': 'Demo Monitoring',
            'duration': 338,
        },
        'skip': 'Requires pluralsight account credentials',
    }, {
        'url': 'https://app.pluralsight.com/training/player?course=angularjs-get-started&author=scott-allen&name=angularjs-get-started-m1-introduction&clip=0&mode=live',
        'only_matching': True,
    }, {
        # available without pluralsight account
        'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started',
        'only_matching': True,
    }, {
        'url': 'https://app.pluralsight.com/player?course=ccna-intro-networking&author=ross-bagurdes&name=ccna-intro-networking-m06&clip=0',
        'only_matching': True,
    }]

    # %-template filled from the ``clip_post`` dict built in _real_extract.
    GRAPHQL_VIEWCLIP_TMPL = '''
query viewClip {
  viewClip(input: {
    author: "%(author)s",
    clipIndex: %(clipIndex)d,
    courseName: "%(courseName)s",
    includeCaptions: %(includeCaptions)s,
    locale: "%(locale)s",
    mediaType: "%(mediaType)s",
    moduleName: "%(moduleName)s",
    quality: "%(quality)s"
  }) {
    urls {
      url
      cdn
      rank
      source
    },
    status
  }
}'''

    def _perform_login(self, username, password):
        """Log in by submitting the login page's form with the credentials.

        Raises ExtractorError when the response contains a validation
        error, or when none of the logged-in markers is found in it.
        """
        login_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')

        login_form = self._hidden_inputs(login_page)

        login_form.update({
            'Username': username,
            'Password': password,
        })

        post_url = self._search_regex(
            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
            'post url', default=self._LOGIN_URL, group='url')

        # The form action may be relative; resolve it against the login URL
        if not post_url.startswith('http'):
            post_url = urllib.parse.urljoin(self._LOGIN_URL, post_url)

        response = self._download_webpage(
            post_url, None, 'Logging in',
            data=urlencode_postdata(login_form),
            headers={'Content-Type': 'application/x-www-form-urlencoded'})

        error = self._search_regex(
            r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>',
            response, 'error message', default=None)
        if error:
            raise ExtractorError(f'Unable to login: {error}', expected=True)

        # Absence of every logged-in marker means the login did not succeed
        if all(not re.search(p, response) for p in (
                r'__INITIAL_STATE__', r'["\']currentUser["\']',
                # new layout?
                r'>\s*Sign out\s*<')):
            BLOCKED = 'Your account has been blocked due to suspicious activity'
            if BLOCKED in response:
                raise ExtractorError(
                    f'Unable to login: {BLOCKED}', expected=True)
            MUST_AGREE = 'To continue using Pluralsight, you must agree to'
            if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')):
                raise ExtractorError(
                    f'Unable to login: {MUST_AGREE} some documents. Go to pluralsight.com, '
                    'log in and agree with what Pluralsight requires.', expected=True)

            raise ExtractorError('Unable to log in')

    def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_id):
        """Download captions for a clip and return them as JSON and SRT.

        The transcript endpoint keyed by *clip_id* is tried first (when a
        clip id is given); if it yields nothing, the retrieve-captions
        endpoint keyed by author/module/clip index is used. Both requests
        are non-fatal. Returns a ``{lang: [...]}`` dict, or None when no
        captions were obtained.
        """
        captions = None
        if clip_id:
            captions = self._download_json(
                f'{self._API_BASE}/transcript/api/v1/caption/json/{clip_id}/{lang}', video_id,
                'Downloading captions JSON', 'Unable to download captions JSON',
                fatal=False)
        if not captions:
            captions_post = {
                'a': author,
                'cn': int(clip_idx),
                'lc': lang,
                'm': name,
            }
            captions = self._download_json(
                f'{self._API_BASE}/player/retrieve-captions', video_id,
                'Downloading captions JSON', 'Unable to download captions JSON',
                fatal=False, data=json.dumps(captions_post).encode(),
                headers={'Content-Type': 'application/json;charset=utf-8'})
        if captions:
            return {
                lang: [{
                    'ext': 'json',
                    'data': json.dumps(captions),
                }, {
                    'ext': 'srt',
                    'data': self._convert_subtitles(duration, captions),
                }],
            }

    @staticmethod
    def _convert_subtitles(duration, subs):
        """Convert a sequence of caption dicts into SRT text.

        Each cue starts at its own time offset and ends at the next cue's
        offset; the last cue ends at *duration*. Cues lacking a start, a
        text or a computable end are skipped. Cues are numbered by their
        position in *subs*.
        """
        TIME_OFFSET_KEYS = ('displayTimeOffset', 'DisplayTimeOffset')
        TEXT_KEYS = ('text', 'Text')
        last_index = len(subs) - 1
        cues = []
        for num, current in enumerate(subs):
            # skip_false_values=False so that an offset of 0 is kept
            start = float_or_none(
                dict_get(current, TIME_OFFSET_KEYS, skip_false_values=False))
            text = dict_get(current, TEXT_KEYS)
            if start is None or text is None:
                continue
            if num == last_index:
                end = duration
            else:
                end = float_or_none(
                    dict_get(subs[num + 1], TIME_OFFSET_KEYS, skip_false_values=False))
            if end is None:
                continue
            cues.append(os.linesep.join((
                f'{num}',
                f'{srt_subtitles_timecode(start)} --> {srt_subtitles_timecode(end)}',
                text,
                os.linesep,
            )))
        return ''.join(cues)

    def _real_extract(self, url):
        """Extract formats, subtitles and metadata for the clip in *url*.

        Raises ExtractorError for a URL missing any required query
        parameter or carrying a non-numeric clip index, and when the clip
        cannot be found in the course metadata.
        """
        qs = parse_qs(url)

        author = qs.get('author', [None])[0]
        name = qs.get('name', [None])[0]
        clip_idx = qs.get('clip', [None])[0]
        course_name = qs.get('course', [None])[0]

        if any(not f for f in (author, name, clip_idx, course_name)):
            raise ExtractorError('Invalid URL', expected=True)

        # The clip index is later sent to the API as an integer; reject a
        # non-numeric value here instead of failing with a bare ValueError.
        clip_num = int_or_none(clip_idx)
        if clip_num is None:
            raise ExtractorError('Invalid URL', expected=True)

        display_id = f'{name}-{clip_idx}'

        course = self._download_course(course_name, url, display_id)

        collection = course['modules']

        clip = None

        for module_ in collection:
            if name not in (module_.get('moduleName'), module_.get('name')):
                continue
            for clip_ in module_.get('clips') or []:
                clip_index = clip_.get('clipIndex')
                if clip_index is None:
                    clip_index = clip_.get('index')
                if clip_index is None:
                    continue
                if str(clip_index) == clip_idx:
                    clip = clip_
                    break
            if clip:
                # Stop scanning the remaining modules once a clip matched
                break

        if not clip:
            raise ExtractorError('Unable to resolve clip')

        title = clip['title']
        clip_id = clip.get('clipName') or clip.get('name') or clip['clipId']

        QUALITIES = {
            'low': {'width': 640, 'height': 480},
            'medium': {'width': 848, 'height': 640},
            'high': {'width': 1024, 'height': 768},
            'high-widescreen': {'width': 1280, 'height': 720},
        }

        QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen')
        quality_key = qualities(QUALITIES_PREFERENCE)

        AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities'])

        ALLOWED_QUALITIES = (
            AllowedQuality('webm', ['high']),
            AllowedQuality('mp4', ['low', 'medium', 'high']),
        )

        # Some courses also offer widescreen resolution for high quality (see
        # https://github.com/ytdl-org/youtube-dl/issues/7766)
        widescreen = course.get('supportsWideScreenVideoFormats') is True
        best_quality = 'high-widescreen' if widescreen else 'high'
        if widescreen:
            for allowed_quality in ALLOWED_QUALITIES:
                allowed_quality.qualities.append(best_quality)

        # In order to minimize the number of calls to ViewClip API and reduce
        # the probability of being throttled or banned by Pluralsight we will request
        # only single format until formats listing was explicitly requested.
        if self.get_param('listformats', False):
            allowed_qualities = ALLOWED_QUALITIES
        else:
            def guess_allowed_qualities():
                # A requested format of the form "<ext>-<quality>" selects
                # exactly that format when it is an allowed combination.
                req_format = self.get_param('format') or 'best'
                req_format_split = req_format.split('-', 1)
                if len(req_format_split) > 1:
                    req_ext, req_quality = req_format_split
                    req_quality = '-'.join(req_quality.split('-')[:2])
                    for allowed_quality in ALLOWED_QUALITIES:
                        if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities:
                            return (AllowedQuality(req_ext, (req_quality, )), )
                req_ext = 'webm' if self.get_param('prefer_free_formats') else 'mp4'
                return (AllowedQuality(req_ext, (best_quality, )), )
            allowed_qualities = guess_allowed_qualities()

        formats = []
        for ext, qualities_ in allowed_qualities:
            for quality in qualities_:
                f = QUALITIES[quality].copy()
                clip_post = {
                    'author': author,
                    'includeCaptions': 'false',
                    'clipIndex': clip_num,
                    'courseName': course_name,
                    'locale': 'en',
                    'moduleName': name,
                    'mediaType': ext,
                    'quality': '%dx%d' % (f['width'], f['height']),
                }
                format_id = f'{ext}-{quality}'

                # Built outside the try block so that only download and
                # response-shape failures trigger the fallback below.
                graphql_payload = json.dumps({
                    'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post,
                    'variables': {},
                }).encode()

                try:
                    viewclip = self._download_json(
                        self._GRAPHQL_EP, display_id,
                        f'Downloading {format_id} viewclip graphql',
                        data=graphql_payload,
                        headers=self._GRAPHQL_HEADERS)['data']['viewClip']
                except (ExtractorError, KeyError, TypeError):
                    # Still works but most likely will go soon
                    viewclip = self._download_json(
                        f'{self._API_BASE}/video/clips/viewclip', display_id,
                        f'Downloading {format_id} viewclip JSON', fatal=False,
                        data=json.dumps(clip_post).encode(),
                        headers={'Content-Type': 'application/json;charset=utf-8'})

                # Pluralsight tracks multiple sequential calls to ViewClip API and start
                # to return 429 HTTP errors after some time (see
                # https://github.com/ytdl-org/youtube-dl/pull/6989). Moreover it may even lead
                # to account ban (see https://github.com/ytdl-org/youtube-dl/issues/6842).
                # To somewhat reduce the probability of these consequences
                # we will sleep random amount of time before each call to ViewClip.
                self._sleep(
                    random.randint(5, 10), display_id,
                    '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling')

                if not viewclip:
                    continue

                clip_urls = viewclip.get('urls')
                if not isinstance(clip_urls, list):
                    continue

                for clip_url_data in clip_urls:
                    clip_url = clip_url_data.get('url')
                    if not clip_url:
                        continue
                    cdn = clip_url_data.get('cdn')
                    clip_f = f.copy()
                    clip_f.update({
                        'url': clip_url,
                        'ext': ext,
                        'format_id': f'{format_id}-{cdn}' if cdn else format_id,
                        'quality': quality_key(quality),
                        'source_preference': int_or_none(clip_url_data.get('rank')),
                    })
                    formats.append(clip_f)

        duration = int_or_none(
            clip.get('duration')) or parse_duration(clip.get('formattedDuration'))

        # TODO: other languages?
        subtitles = self.extract_subtitles(
            author, clip_idx, clip.get('clipId'), 'en', name, duration, display_id)

        return {
            'id': clip_id,
            'title': title,
            'duration': duration,
            'creator': author,
            'formats': formats,
            'subtitles': subtitles,
        }
421 | ||
422 | ||
class PluralsightCourseIE(PluralsightBaseIE):
    """Playlist extractor that turns a Pluralsight course page into one
    url_transparent entry per clip, each delegated to PluralsightIE.
    """

    IE_NAME = 'pluralsight:course'
    _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:library/)?courses/(?P<id>[^/]+)'
    _TESTS = [{
        # Free course from Pluralsight Starter Subscription for Microsoft TechNet
        # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz
        'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas',
        'info_dict': {
            'id': 'hosting-sql-server-windows-azure-iaas',
            'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals',
            'description': 'md5:61b37e60f21c4b2f91dc621a977d0986',
        },
        'playlist_count': 31,
    }, {
        # available without pluralsight account
        'url': 'https://www.pluralsight.com/courses/angularjs-get-started',
        'only_matching': True,
    }, {
        'url': 'https://app.pluralsight.com/library/courses/understanding-microsoft-azure-amazon-aws/table-of-contents',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist with one entry per clip of the course in *url*.

        Modules without an author or a name, and clips without an integer
        index, are left out of the playlist.
        """
        course_id = self._match_id(url)

        # TODO: PSM cookie

        course = self._download_course(course_id, url, course_id)

        playlist_title = course['title']
        course_name = course['name']
        modules = course['modules']
        playlist_description = (
            course.get('description') or course.get('shortDescription'))

        player_url = f'{self._API_BASE}/player'

        def iter_clip_entries():
            # Chapter numbers are 1-based and count skipped modules too
            for chapter_number, module in enumerate(modules, 1):
                author = module.get('author')
                module_name = module.get('name')
                if not (author and module_name):
                    continue
                for clip in module.get('clips', []):
                    clip_index = int_or_none(clip.get('index'))
                    if clip_index is None:
                        continue
                    yield {
                        '_type': 'url_transparent',
                        'url': update_url_query(player_url, query={
                            'mode': 'live',
                            'course': course_name,
                            'author': author,
                            'name': module_name,
                            'clip': clip_index,
                        }),
                        'ie_key': PluralsightIE.ie_key(),
                        'chapter': module.get('title'),
                        'chapter_number': chapter_number,
                        'chapter_id': module.get('moduleRef'),
                    }

        return self.playlist_result(
            list(iter_clip_entries()), course_id, playlist_title,
            playlist_description)