]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/slideslive.py
[extractor/slideslive] Fix slides and chapters/duration (#6024)
[yt-dlp.git] / yt_dlp / extractor / slideslive.py
CommitLineData
3d667e00 1import re
2import urllib.parse
3
d0f2d641 4from .common import InfoExtractor
29f7c58a 5from ..utils import (
3d667e00 6 ExtractorError,
7 int_or_none,
8 parse_qs,
29f7c58a 9 smuggle_url,
f69b0554 10 traverse_obj,
11 unified_timestamp,
3d667e00 12 update_url_query,
29f7c58a 13 url_or_none,
3d667e00 14 xpath_text,
29f7c58a 15)
d0f2d641
JW
16
17
18class SlidesLiveIE(InfoExtractor):
3d667e00 19 _VALID_URL = r'https?://slideslive\.com/(?:embed/(?:presentation/)?)?(?P<id>[0-9]+)'
d0f2d641 20 _TESTS = [{
3d667e00 21 # service_name = yoda, only XML slides info
d0f2d641 22 'url': 'https://slideslive.com/38902413/gcc-ia16-backend',
d0f2d641 23 'info_dict': {
f69b0554 24 'id': '38902413',
d0f2d641 25 'ext': 'mp4',
b33a05d2 26 'title': 'GCC IA16 backend',
f69b0554 27 'timestamp': 1648189972,
28 'upload_date': '20220325',
29 'thumbnail': r're:^https?://.*\.jpg',
3d667e00 30 'thumbnails': 'count:42',
31 'chapters': 'count:41',
5ab3534d 32 'duration': 1638,
f69b0554 33 },
34 'params': {
35 'skip_download': 'm3u8',
36 },
29f7c58a 37 }, {
3d667e00 38 # service_name = yoda, /v7/ slides
29f7c58a 39 'url': 'https://slideslive.com/38935785',
29f7c58a 40 'info_dict': {
f69b0554 41 'id': '38935785',
29f7c58a 42 'ext': 'mp4',
43 'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges',
f69b0554 44 'upload_date': '20211115',
45 'timestamp': 1636996003,
3d667e00 46 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
47 'thumbnails': 'count:640',
48 'chapters': 'count:639',
5ab3534d 49 'duration': 9832,
f69b0554 50 },
51 'params': {
52 'skip_download': 'm3u8',
53 },
54 }, {
3d667e00 55 # service_name = yoda, /v1/ slides
f69b0554 56 'url': 'https://slideslive.com/38973182/how-should-a-machine-learning-researcher-think-about-ai-ethics',
57 'info_dict': {
58 'id': '38973182',
59 'ext': 'mp4',
60 'title': 'How Should a Machine Learning Researcher Think About AI Ethics?',
61 'upload_date': '20220201',
62 'thumbnail': r're:^https?://.*\.jpg',
63 'timestamp': 1643728135,
3d667e00 64 'thumbnails': 'count:3',
65 'chapters': 'count:2',
5ab3534d 66 'duration': 5889,
f69b0554 67 },
68 'params': {
69 'skip_download': 'm3u8',
29f7c58a 70 },
aa1d5eb9 71 }, {
3d667e00 72 # service_name = youtube, only XML slides info
f69b0554 73 'url': 'https://slideslive.com/38897546/special-metaprednaska-petra-ludwiga-hodnoty-pro-lepsi-spolecnost',
74 'md5': '8a79b5e3d700837f40bd2afca3c8fa01',
75 'info_dict': {
76 'id': 'jmg02wCJD5M',
77 'display_id': '38897546',
78 'ext': 'mp4',
79 'title': 'SPECIÁL: Meta-přednáška Petra Ludwiga - Hodnoty pro lepší společnost',
80 'description': 'Watch full version of this video at https://slideslive.com/38897546.',
81 'channel_url': 'https://www.youtube.com/channel/UCZWdAkNYFncuX0khyvhqnxw',
82 'channel': 'SlidesLive Videos - G1',
83 'channel_id': 'UCZWdAkNYFncuX0khyvhqnxw',
84 'uploader_id': 'UCZWdAkNYFncuX0khyvhqnxw',
85 'uploader': 'SlidesLive Videos - G1',
86 'uploader_url': 'http://www.youtube.com/channel/UCZWdAkNYFncuX0khyvhqnxw',
87 'live_status': 'not_live',
88 'upload_date': '20160710',
89 'timestamp': 1618786715,
90 'duration': 6827,
91 'like_count': int,
92 'view_count': int,
93 'comment_count': int,
94 'channel_follower_count': int,
95 'age_limit': 0,
3d667e00 96 'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
97 'thumbnails': 'count:169',
f69b0554 98 'playable_in_embed': True,
99 'availability': 'unlisted',
100 'tags': [],
101 'categories': ['People & Blogs'],
3d667e00 102 'chapters': 'count:168',
103 },
104 }, {
105 # embed-only presentation, only XML slides info
106 'url': 'https://slideslive.com/embed/presentation/38925850',
107 'info_dict': {
108 'id': '38925850',
109 'ext': 'mp4',
110 'title': 'Towards a Deep Network Architecture for Structured Smoothness',
111 'thumbnail': r're:^https?://.*\.jpg',
112 'thumbnails': 'count:8',
113 'timestamp': 1629671508,
114 'upload_date': '20210822',
115 'chapters': 'count:7',
5ab3534d 116 'duration': 326,
3d667e00 117 },
118 'params': {
119 'skip_download': 'm3u8',
f69b0554 120 },
121 }, {
3d667e00 122 # embed-only presentation, only JSON slides info, /v5/ slides (.png)
123 'url': 'https://slideslive.com/38979920/',
124 'info_dict': {
125 'id': '38979920',
126 'ext': 'mp4',
127 'title': 'MoReL: Multi-omics Relational Learning',
128 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
129 'thumbnails': 'count:7',
130 'timestamp': 1654714970,
131 'upload_date': '20220608',
132 'chapters': 'count:6',
5ab3534d 133 'duration': 171,
3d667e00 134 },
135 'params': {
136 'skip_download': 'm3u8',
137 },
138 }, {
139 # /v2/ slides (.jpg)
140 'url': 'https://slideslive.com/38954074',
141 'info_dict': {
142 'id': '38954074',
143 'ext': 'mp4',
144 'title': 'Decentralized Attribution of Generative Models',
145 'thumbnail': r're:^https?://.*\.jpg',
146 'thumbnails': 'count:16',
147 'timestamp': 1622806321,
148 'upload_date': '20210604',
149 'chapters': 'count:15',
5ab3534d 150 'duration': 306,
3d667e00 151 },
152 'params': {
153 'skip_download': 'm3u8',
154 },
155 }, {
156 # /v4/ slides (.png)
157 'url': 'https://slideslive.com/38979570/',
158 'info_dict': {
159 'id': '38979570',
160 'ext': 'mp4',
161 'title': 'Efficient Active Search for Combinatorial Optimization Problems',
162 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
163 'thumbnails': 'count:9',
164 'timestamp': 1654714896,
165 'upload_date': '20220608',
166 'chapters': 'count:8',
5ab3534d 167 'duration': 295,
3d667e00 168 },
169 'params': {
170 'skip_download': 'm3u8',
171 },
172 }, {
173 # /v10/ slides
174 'url': 'https://slideslive.com/embed/presentation/38979880?embed_parent_url=https%3A%2F%2Fedit.videoken.com%2F',
175 'info_dict': {
176 'id': '38979880',
177 'ext': 'mp4',
178 'title': 'The Representation Power of Neural Networks',
179 'timestamp': 1654714962,
180 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
181 'thumbnails': 'count:22',
182 'upload_date': '20220608',
183 'chapters': 'count:21',
5ab3534d 184 'duration': 294,
3d667e00 185 },
186 'params': {
187 'skip_download': 'm3u8',
188 },
189 }, {
190 # /v7/ slides, 2 video slides
191 'url': 'https://slideslive.com/embed/presentation/38979682?embed_container_origin=https%3A%2F%2Fedit.videoken.com',
192 'playlist_count': 3,
193 'info_dict': {
194 'id': '38979682-playlist',
195 'title': 'LoRA: Low-Rank Adaptation of Large Language Models',
196 },
197 'playlist': [{
198 'info_dict': {
199 'id': '38979682',
200 'ext': 'mp4',
201 'title': 'LoRA: Low-Rank Adaptation of Large Language Models',
202 'timestamp': 1654714920,
203 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
204 'thumbnails': 'count:30',
205 'upload_date': '20220608',
206 'chapters': 'count:31',
5ab3534d 207 'duration': 272,
3d667e00 208 },
209 }, {
210 'info_dict': {
211 'id': '38979682-021',
212 'ext': 'mp4',
213 'title': 'LoRA: Low-Rank Adaptation of Large Language Models - Slide 021',
214 'duration': 3,
215 'timestamp': 1654714920,
216 'upload_date': '20220608',
217 },
218 }, {
219 'info_dict': {
220 'id': '38979682-024',
221 'ext': 'mp4',
222 'title': 'LoRA: Low-Rank Adaptation of Large Language Models - Slide 024',
223 'duration': 4,
224 'timestamp': 1654714920,
225 'upload_date': '20220608',
226 },
227 }],
228 'params': {
229 'skip_download': 'm3u8',
230 },
231 }, {
232 # /v6/ slides, 1 video slide, edit.videoken.com embed
233 'url': 'https://slideslive.com/38979481/',
234 'playlist_count': 2,
235 'info_dict': {
236 'id': '38979481-playlist',
237 'title': 'How to Train Your MAML to Excel in Few-Shot Classification',
238 },
239 'playlist': [{
240 'info_dict': {
241 'id': '38979481',
242 'ext': 'mp4',
243 'title': 'How to Train Your MAML to Excel in Few-Shot Classification',
244 'timestamp': 1654714877,
245 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
246 'thumbnails': 'count:43',
247 'upload_date': '20220608',
248 'chapters': 'count:43',
5ab3534d 249 'duration': 315,
3d667e00 250 },
251 }, {
252 'info_dict': {
253 'id': '38979481-013',
254 'ext': 'mp4',
255 'title': 'How to Train Your MAML to Excel in Few-Shot Classification - Slide 013',
256 'duration': 3,
257 'timestamp': 1654714877,
258 'upload_date': '20220608',
259 },
260 }],
261 'params': {
262 'skip_download': 'm3u8',
263 },
264 }, {
265 # /v3/ slides, .jpg and .png, service_name = youtube
266 'url': 'https://slideslive.com/embed/38932460/',
267 'info_dict': {
268 'id': 'RTPdrgkyTiE',
269 'display_id': '38932460',
270 'ext': 'mp4',
271 'title': 'Active Learning for Hierarchical Multi-Label Classification',
272 'description': 'Watch full version of this video at https://slideslive.com/38932460.',
273 'channel': 'SlidesLive Videos - A',
274 'channel_id': 'UC62SdArr41t_-_fX40QCLRw',
275 'channel_url': 'https://www.youtube.com/channel/UC62SdArr41t_-_fX40QCLRw',
276 'uploader': 'SlidesLive Videos - A',
277 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw',
278 'uploader_url': 'http://www.youtube.com/channel/UC62SdArr41t_-_fX40QCLRw',
279 'upload_date': '20200903',
280 'timestamp': 1602599092,
281 'duration': 942,
282 'age_limit': 0,
283 'live_status': 'not_live',
284 'playable_in_embed': True,
285 'availability': 'unlisted',
286 'categories': ['People & Blogs'],
287 'tags': [],
288 'channel_follower_count': int,
289 'like_count': int,
290 'view_count': int,
291 'thumbnail': r're:^https?://.*\.(?:jpg|png|webp)',
292 'thumbnails': 'count:21',
293 'chapters': 'count:20',
294 },
295 'params': {
296 'skip_download': 'm3u8',
297 },
5ab3534d 298 }, {
299 # /v3/ slides, .png only, service_name = yoda
300 'url': 'https://slideslive.com/38983994',
301 'info_dict': {
302 'id': '38983994',
303 'ext': 'mp4',
304 'title': 'Zero-Shot AutoML with Pretrained Models',
305 'timestamp': 1662384834,
306 'upload_date': '20220905',
307 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
308 'thumbnails': 'count:23',
309 'chapters': 'count:22',
310 'duration': 295,
311 },
312 'params': {
313 'skip_download': 'm3u8',
314 },
3d667e00 315 }, {
316 # service_name = yoda
aa1d5eb9
RA
317 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend',
318 'only_matching': True,
73d8f3a6 319 }, {
3d667e00 320 # dead link, service_name = url
73d8f3a6
RA
321 'url': 'https://slideslive.com/38922070/learning-transferable-skills-1',
322 'only_matching': True,
323 }, {
3d667e00 324 # dead link, service_name = vimeo
73d8f3a6
RA
325 'url': 'https://slideslive.com/38921896/retrospectives-a-venue-for-selfreflection-in-ml-research-3',
326 'only_matching': True,
d0f2d641
JW
327 }]
328
3d667e00 329 _WEBPAGE_TESTS = [{
330 # only XML slides info
331 'url': 'https://iclr.cc/virtual_2020/poster_Hklr204Fvr.html',
332 'info_dict': {
333 'id': '38925850',
334 'ext': 'mp4',
335 'title': 'Towards a Deep Network Architecture for Structured Smoothness',
336 'thumbnail': r're:^https?://.*\.jpg',
337 'thumbnails': 'count:8',
338 'timestamp': 1629671508,
339 'upload_date': '20210822',
340 'chapters': 'count:7',
5ab3534d 341 'duration': 326,
3d667e00 342 },
343 'params': {
344 'skip_download': 'm3u8',
345 },
346 }]
347
348 @classmethod
349 def _extract_embed_urls(cls, url, webpage):
350 # Reference: https://slideslive.com/embed_presentation.js
351 for embed_id in re.findall(r'(?s)new\s+SlidesLiveEmbed\s*\([^)]+\bpresentationId:\s*["\'](\d+)["\']', webpage):
352 url_parsed = urllib.parse.urlparse(url)
353 origin = f'{url_parsed.scheme}://{url_parsed.netloc}'
354 yield update_url_query(
355 f'https://slideslive.com/embed/presentation/{embed_id}', {
356 'embed_parent_url': url,
357 'embed_container_origin': origin,
358 })
359
def _download_embed_webpage_handle(self, video_id, headers):
    """Fetch the embed page of presentation *video_id*.

    *headers* is sent with the request; its ``Referer`` and ``Origin``
    entries are additionally passed (through ``traverse_obj``) as the
    ``embed_parent_url`` and ``embed_container_origin`` query parameters.
    Returns whatever ``_download_webpage_handle`` returns.
    """
    embed_url = f'https://slideslive.com/embed/presentation/{video_id}'
    embed_query = traverse_obj(headers, {
        'embed_parent_url': 'Referer',
        'embed_container_origin': 'Origin',
    })
    return self._download_webpage_handle(
        embed_url, video_id, headers=headers, query=embed_query)
367
f69b0554 368 def _extract_custom_m3u8_info(self, m3u8_data):
369 m3u8_dict = {}
370
371 lookup = {
372 'PRESENTATION-TITLE': 'title',
373 'PRESENTATION-UPDATED-AT': 'timestamp',
374 'PRESENTATION-THUMBNAIL': 'thumbnail',
375 'PLAYLIST-TYPE': 'playlist_type',
376 'VOD-VIDEO-SERVICE-NAME': 'service_name',
377 'VOD-VIDEO-ID': 'service_id',
378 'VOD-VIDEO-SERVERS': 'video_servers',
379 'VOD-SUBTITLES': 'subtitles',
3d667e00 380 'VOD-SLIDES-JSON-URL': 'slides_json_url',
381 'VOD-SLIDES-XML-URL': 'slides_xml_url',
f69b0554 382 }
383
384 for line in m3u8_data.splitlines():
385 if not line.startswith('#EXT-SL-'):
386 continue
387 tag, _, value = line.partition(':')
388 key = lookup.get(tag.lstrip('#EXT-SL-'))
389 if not key:
390 continue
391 m3u8_dict[key] = value
392
393 # Some values are stringified JSON arrays
394 for key in ('video_servers', 'subtitles'):
395 if key in m3u8_dict:
396 m3u8_dict[key] = self._parse_json(m3u8_dict[key], None, fatal=False) or []
397
398 return m3u8_dict
399
def _extract_formats_and_duration(self, cdn_hostname, path, video_id, skip_duration=False):
    """Gather HLS and DASH formats (and optionally the duration) for one video.

    Both manifests live under ``https://<cdn_hostname>/<path>/`` and are
    fetched non-fatally. Unless *skip_duration* is set, the duration is taken
    from the first HLS format's URL, and from the DASH manifest only when the
    HLS lookup produced nothing truthy.

    Returns a ``(formats, duration)`` tuple; ``duration`` stays ``None`` when
    it was skipped or could not be determined.
    """
    manifest_base = f'https://{cdn_hostname}/{path}'
    want_duration = not skip_duration
    collected, total_duration = [], None

    hls = self._extract_m3u8_formats(
        f'{manifest_base}/master.m3u8',
        video_id, 'mp4', m3u8_id='hls', fatal=False, live=True)
    if hls:
        if want_duration:
            total_duration = self._extract_m3u8_vod_duration(
                hls[0]['url'], video_id, note='Extracting duration from HLS manifest')
        collected += hls

    mpd_url = f'{manifest_base}/master.mpd'
    dash = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False)
    if dash:
        # DASH is only consulted for the duration when HLS did not provide one
        if want_duration and not total_duration:
            total_duration = self._extract_mpd_vod_duration(
                mpd_url, video_id,
                note='Extracting duration from DASH manifest')
        collected += dash

    return collected, total_duration
3d667e00 422
d0f2d641
JW
def _real_extract(self, url):
    """Extract a SlidesLive presentation from *url*.

    Steps, in order:
      1. Download the embed page, forwarding ``embed_parent_url`` /
         ``embed_container_origin`` from *url* as Referer/Origin. If the
         request lands on a ``domain_not_allowed`` URL, retry once using the
         first advertised allowed domain.
      2. Read the player token from the page and download the player
         playlist, which is parsed by ``_extract_custom_m3u8_info``.
      3. Load slide metadata (JSON first, XML as fallback) and turn it into
         thumbnails and chapters.
      4. Build the main info dict according to the video service
         (``url``, ``yoda``, ``vimeo`` or ``youtube``).
      5. If the slides JSON lists video slides, return a playlist consisting
         of the main video followed by each ``yoda``-hosted video slide.

    Returns the info dict of the presentation, or a playlist result when
    video slides are present.

    Raises ExtractorError for embed-only presentations without a usable
    allowed domain and for unknown video services.
    """
    # Function-scope import: only needed for the Element type check on the slides XML
    import xml.etree.ElementTree

    video_id = self._match_id(url)
    webpage, urlh = self._download_embed_webpage_handle(
        video_id, headers=traverse_obj(parse_qs(url), {
            'Referer': ('embed_parent_url', -1),
            'Origin': ('embed_container_origin', -1)}))
    redirect_url = urlh.geturl()
    if 'domain_not_allowed' in redirect_url:
        # The redirect target advertises the domains allowed to embed; retry as the first one
        domain = traverse_obj(parse_qs(redirect_url), ('allowed_domains[]', ...), get_all=False)
        if not domain:
            raise ExtractorError(
                'This is an embed-only presentation. Try passing --referer', expected=True)
        webpage, _ = self._download_embed_webpage_handle(video_id, headers={
            'Referer': f'https://{domain}/',
            'Origin': f'https://{domain}',
        })

    player_token = self._search_regex(r'data-player-token="([^"]+)"', webpage, 'player token')
    player_data = self._download_webpage(
        f'https://ben.slideslive.com/player/{video_id}', video_id,
        note='Downloading player info', query={'player_token': player_token})
    player_info = self._extract_custom_m3u8_info(player_data)

    service_name = player_info['service_name'].lower()
    # Validate with an explicit raise: the value comes from the remote playlist,
    # and `assert` statements are removed when Python runs with -O
    if service_name not in ('url', 'yoda', 'vimeo', 'youtube'):
        raise ExtractorError(f'Unsupported service name: {service_name!r}')
    service_id = player_info['service_id']

    slide_url_template = 'https://slides.slideslive.com/%s/slides/original/%s%s'
    slides, slides_info = {}, []

    if player_info.get('slides_json_url'):
        slides = self._download_json(
            player_info['slides_json_url'], video_id, fatal=False,
            note='Downloading slides JSON', errnote=False) or {}
        slide_ext_default = '.png'
        slide_quality = traverse_obj(slides, ('slide_qualities', 0))
        if slide_quality:
            # A listed quality switches both the default extension and the CDN path
            slide_ext_default = '.jpg'
            slide_url_template = f'https://cdn.slideslive.com/data/presentations/%s/slides/{slide_quality}/%s%s'
        for slide_id, slide in enumerate(traverse_obj(slides, ('slides', ...), expected_type=dict), 1):
            slides_info.append((
                slide_id, traverse_obj(slide, ('image', 'name')),
                traverse_obj(slide, ('image', 'extname'), default=slide_ext_default),
                int_or_none(slide.get('time'), scale=1000)))

    # XML is only consulted when the JSON gave nothing
    if not slides and player_info.get('slides_xml_url'):
        slides = self._download_xml(
            player_info['slides_xml_url'], video_id, fatal=False,
            note='Downloading slides XML', errnote='Failed to download slides info')
        slide_url_template = 'https://cdn.slideslive.com/data/presentations/%s/slides/big/%s%s'
        # Type-check instead of truth-testing: an Element without children is falsy and
        # truth-testing Elements is deprecated; a failed download yields no Element at all
        if isinstance(slides, xml.etree.ElementTree.Element):
            slide_nodes = slides.findall('./slide')
        else:
            slide_nodes = []
        for slide_id, slide in enumerate(slide_nodes, 1):
            slides_info.append((
                slide_id, xpath_text(slide, './slideName', 'name'), '.jpg',
                int_or_none(xpath_text(slide, './timeSec', 'time'))))

    chapters, thumbnails = [], []
    if url_or_none(player_info.get('thumbnail')):
        thumbnails.append({'id': 'cover', 'url': player_info['thumbnail']})
    for slide_id, slide_path, slide_ext, start_time in slides_info:
        # A slide without an image still produces a chapter, just no thumbnail
        if slide_path:
            thumbnails.append({
                'id': f'{slide_id:03d}',
                'url': slide_url_template % (video_id, slide_path, slide_ext),
            })
        chapters.append({
            'title': f'Slide {slide_id:03d}',
            'start_time': start_time,
        })

    subtitles = {}
    for sub in traverse_obj(player_info, ('subtitles', ...), expected_type=dict):
        webvtt_url = url_or_none(sub.get('webvtt_url'))
        if not webvtt_url:
            continue
        subtitles.setdefault(sub.get('language') or 'en', []).append({
            'url': webvtt_url,
            'ext': 'vtt',
        })

    info = {
        'id': video_id,
        'title': player_info.get('title') or self._html_search_meta('title', webpage, default=''),
        'timestamp': unified_timestamp(player_info.get('timestamp')),
        'is_live': player_info.get('playlist_type') != 'vod',
        'thumbnails': thumbnails,
        'chapters': chapters,
        'subtitles': subtitles,
    }

    if service_name == 'url':
        info['url'] = service_id
    elif service_name == 'yoda':
        formats, duration = self._extract_formats_and_duration(
            player_info['video_servers'][0], service_id, video_id)
        info.update({
            'duration': duration,
            'formats': formats,
        })
    else:
        # vimeo / youtube: hand over to the extractor named after the service
        info.update({
            '_type': 'url_transparent',
            'url': service_id,
            'ie_key': service_name.capitalize(),
            'display_id': video_id,
        })
        if service_name == 'vimeo':
            info['url'] = smuggle_url(
                f'https://player.vimeo.com/video/{service_id}',
                {'http_headers': {'Referer': url}})

    video_slides = traverse_obj(slides, ('slides', ..., 'video', 'id'))
    if not video_slides:
        return info

    def entries():
        # The presentation itself is always the first playlist entry
        yield info

        service_data = self._download_json(
            f'https://ben.slideslive.com/player/{video_id}/slides_video_service_data',
            video_id, fatal=False, query={
                'player_token': player_token,
                'videos': ','.join(video_slides),
            }, note='Downloading video slides info', errnote='Failed to download video slides info') or {}

        for slide_id, slide in enumerate(traverse_obj(slides, ('slides', ...)), 1):
            if traverse_obj(slide, ('video', 'service')) != 'yoda':
                continue
            video_path = traverse_obj(slide, ('video', 'id'))
            cdn_hostname = traverse_obj(service_data, (
                video_path, 'video_servers', ...), get_all=False)
            if not cdn_hostname or not video_path:
                continue
            # Duration comes from the slide metadata below, so skip the manifest lookups
            formats, _ = self._extract_formats_and_duration(
                cdn_hostname, video_path, video_id, skip_duration=True)
            if not formats:
                continue
            yield {
                'id': f'{video_id}-{slide_id:03d}',
                'title': f'{info["title"]} - Slide {slide_id:03d}',
                'timestamp': info['timestamp'],
                'duration': int_or_none(traverse_obj(slide, ('video', 'duration_ms')), scale=1000),
                'formats': formats,
            }

    return self.playlist_result(entries(), f'{video_id}-playlist', info['title'])