]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/slideslive.py
[cleanup] Fix misc bugs (#8968)
[yt-dlp.git] / yt_dlp / extractor / slideslive.py
1 import re
2 import urllib.parse
3 import xml.etree.ElementTree
4
5 from .common import InfoExtractor
6 from ..utils import (
7 ExtractorError,
8 int_or_none,
9 parse_qs,
10 smuggle_url,
11 traverse_obj,
12 unified_timestamp,
13 update_url_query,
14 url_or_none,
15 xpath_text,
16 )
17
18
19 class SlidesLiveIE(InfoExtractor):
20 _VALID_URL = r'https?://slideslive\.com/(?:embed/(?:presentation/)?)?(?P<id>[0-9]+)'
21 _TESTS = [{
22 # service_name = yoda, only XML slides info
23 'url': 'https://slideslive.com/38902413/gcc-ia16-backend',
24 'info_dict': {
25 'id': '38902413',
26 'ext': 'mp4',
27 'title': 'GCC IA16 backend',
28 'timestamp': 1648189972,
29 'upload_date': '20220325',
30 'thumbnail': r're:^https?://.*\.jpg',
31 'thumbnails': 'count:42',
32 'chapters': 'count:41',
33 'duration': 1638,
34 },
35 'params': {
36 'skip_download': 'm3u8',
37 },
38 }, {
39 # service_name = yoda, /v7/ slides
40 'url': 'https://slideslive.com/38935785',
41 'info_dict': {
42 'id': '38935785',
43 'ext': 'mp4',
44 'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges',
45 'upload_date': '20211115',
46 'timestamp': 1636996003,
47 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
48 'thumbnails': 'count:640',
49 'chapters': 'count:639',
50 'duration': 9832,
51 },
52 'params': {
53 'skip_download': 'm3u8',
54 },
55 }, {
56 # service_name = yoda, /v1/ slides
57 'url': 'https://slideslive.com/38973182/how-should-a-machine-learning-researcher-think-about-ai-ethics',
58 'info_dict': {
59 'id': '38973182',
60 'ext': 'mp4',
61 'title': 'How Should a Machine Learning Researcher Think About AI Ethics?',
62 'upload_date': '20220201',
63 'thumbnail': r're:^https?://.*\.jpg',
64 'timestamp': 1643728135,
65 'thumbnails': 'count:3',
66 'chapters': 'count:2',
67 'duration': 5889,
68 },
69 'params': {
70 'skip_download': 'm3u8',
71 },
72 }, {
73 # service_name = youtube, only XML slides info
74 'url': 'https://slideslive.com/38897546/special-metaprednaska-petra-ludwiga-hodnoty-pro-lepsi-spolecnost',
75 'md5': '8a79b5e3d700837f40bd2afca3c8fa01',
76 'info_dict': {
77 'id': 'jmg02wCJD5M',
78 'display_id': '38897546',
79 'ext': 'mp4',
80 'title': 'SPECIÁL: Meta-přednáška Petra Ludwiga - Hodnoty pro lepší společnost',
81 'description': 'Watch full version of this video at https://slideslive.com/38897546.',
82 'channel_url': 'https://www.youtube.com/channel/UCZWdAkNYFncuX0khyvhqnxw',
83 'channel': 'SlidesLive Videos - G1',
84 'channel_id': 'UCZWdAkNYFncuX0khyvhqnxw',
85 'uploader_id': 'UCZWdAkNYFncuX0khyvhqnxw',
86 'uploader': 'SlidesLive Videos - G1',
87 'uploader_url': 'http://www.youtube.com/channel/UCZWdAkNYFncuX0khyvhqnxw',
88 'live_status': 'not_live',
89 'upload_date': '20160710',
90 'timestamp': 1618786715,
91 'duration': 6827,
92 'like_count': int,
93 'view_count': int,
94 'comment_count': int,
95 'channel_follower_count': int,
96 'age_limit': 0,
97 'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
98 'thumbnails': 'count:169',
99 'playable_in_embed': True,
100 'availability': 'unlisted',
101 'tags': [],
102 'categories': ['People & Blogs'],
103 'chapters': 'count:168',
104 },
105 }, {
106 # embed-only presentation, only XML slides info
107 'url': 'https://slideslive.com/embed/presentation/38925850',
108 'info_dict': {
109 'id': '38925850',
110 'ext': 'mp4',
111 'title': 'Towards a Deep Network Architecture for Structured Smoothness',
112 'thumbnail': r're:^https?://.*\.jpg',
113 'thumbnails': 'count:8',
114 'timestamp': 1629671508,
115 'upload_date': '20210822',
116 'chapters': 'count:7',
117 'duration': 326,
118 },
119 'params': {
120 'skip_download': 'm3u8',
121 },
122 }, {
123 # embed-only presentation, only JSON slides info, /v5/ slides (.png)
124 'url': 'https://slideslive.com/38979920/',
125 'info_dict': {
126 'id': '38979920',
127 'ext': 'mp4',
128 'title': 'MoReL: Multi-omics Relational Learning',
129 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
130 'thumbnails': 'count:7',
131 'timestamp': 1654714970,
132 'upload_date': '20220608',
133 'chapters': 'count:6',
134 'duration': 171,
135 },
136 'params': {
137 'skip_download': 'm3u8',
138 },
139 }, {
140 # /v2/ slides (.jpg)
141 'url': 'https://slideslive.com/38954074',
142 'info_dict': {
143 'id': '38954074',
144 'ext': 'mp4',
145 'title': 'Decentralized Attribution of Generative Models',
146 'thumbnail': r're:^https?://.*\.jpg',
147 'thumbnails': 'count:16',
148 'timestamp': 1622806321,
149 'upload_date': '20210604',
150 'chapters': 'count:15',
151 'duration': 306,
152 },
153 'params': {
154 'skip_download': 'm3u8',
155 },
156 }, {
157 # /v4/ slides (.png)
158 'url': 'https://slideslive.com/38979570/',
159 'info_dict': {
160 'id': '38979570',
161 'ext': 'mp4',
162 'title': 'Efficient Active Search for Combinatorial Optimization Problems',
163 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
164 'thumbnails': 'count:9',
165 'timestamp': 1654714896,
166 'upload_date': '20220608',
167 'chapters': 'count:8',
168 'duration': 295,
169 },
170 'params': {
171 'skip_download': 'm3u8',
172 },
173 }, {
174 # /v10/ slides
175 'url': 'https://slideslive.com/embed/presentation/38979880?embed_parent_url=https%3A%2F%2Fedit.videoken.com%2F',
176 'info_dict': {
177 'id': '38979880',
178 'ext': 'mp4',
179 'title': 'The Representation Power of Neural Networks',
180 'timestamp': 1654714962,
181 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
182 'thumbnails': 'count:22',
183 'upload_date': '20220608',
184 'chapters': 'count:21',
185 'duration': 294,
186 },
187 'params': {
188 'skip_download': 'm3u8',
189 },
190 }, {
191 # /v7/ slides, 2 video slides
192 'url': 'https://slideslive.com/embed/presentation/38979682?embed_container_origin=https%3A%2F%2Fedit.videoken.com',
193 'playlist_count': 3,
194 'info_dict': {
195 'id': '38979682-playlist',
196 'title': 'LoRA: Low-Rank Adaptation of Large Language Models',
197 },
198 'playlist': [{
199 'info_dict': {
200 'id': '38979682',
201 'ext': 'mp4',
202 'title': 'LoRA: Low-Rank Adaptation of Large Language Models',
203 'timestamp': 1654714920,
204 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
205 'thumbnails': 'count:30',
206 'upload_date': '20220608',
207 'chapters': 'count:31',
208 'duration': 272,
209 },
210 }, {
211 'info_dict': {
212 'id': '38979682-021',
213 'ext': 'mp4',
214 'title': 'LoRA: Low-Rank Adaptation of Large Language Models - Slide 021',
215 'duration': 3,
216 'timestamp': 1654714920,
217 'upload_date': '20220608',
218 },
219 }, {
220 'info_dict': {
221 'id': '38979682-024',
222 'ext': 'mp4',
223 'title': 'LoRA: Low-Rank Adaptation of Large Language Models - Slide 024',
224 'duration': 4,
225 'timestamp': 1654714920,
226 'upload_date': '20220608',
227 },
228 }],
229 'params': {
230 'skip_download': 'm3u8',
231 },
232 }, {
233 # /v6/ slides, 1 video slide, edit.videoken.com embed
234 'url': 'https://slideslive.com/38979481/',
235 'playlist_count': 2,
236 'info_dict': {
237 'id': '38979481-playlist',
238 'title': 'How to Train Your MAML to Excel in Few-Shot Classification',
239 },
240 'playlist': [{
241 'info_dict': {
242 'id': '38979481',
243 'ext': 'mp4',
244 'title': 'How to Train Your MAML to Excel in Few-Shot Classification',
245 'timestamp': 1654714877,
246 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
247 'thumbnails': 'count:43',
248 'upload_date': '20220608',
249 'chapters': 'count:43',
250 'duration': 315,
251 },
252 }, {
253 'info_dict': {
254 'id': '38979481-013',
255 'ext': 'mp4',
256 'title': 'How to Train Your MAML to Excel in Few-Shot Classification - Slide 013',
257 'duration': 3,
258 'timestamp': 1654714877,
259 'upload_date': '20220608',
260 },
261 }],
262 'params': {
263 'skip_download': 'm3u8',
264 },
265 }, {
266 # /v3/ slides, .jpg and .png, service_name = youtube
267 'url': 'https://slideslive.com/embed/38932460/',
268 'info_dict': {
269 'id': 'RTPdrgkyTiE',
270 'display_id': '38932460',
271 'ext': 'mp4',
272 'title': 'Active Learning for Hierarchical Multi-Label Classification',
273 'description': 'Watch full version of this video at https://slideslive.com/38932460.',
274 'channel': 'SlidesLive Videos - A',
275 'channel_id': 'UC62SdArr41t_-_fX40QCLRw',
276 'channel_url': 'https://www.youtube.com/channel/UC62SdArr41t_-_fX40QCLRw',
277 'uploader': 'SlidesLive Videos - A',
278 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw',
279 'uploader_url': 'http://www.youtube.com/channel/UC62SdArr41t_-_fX40QCLRw',
280 'upload_date': '20200903',
281 'timestamp': 1602599092,
282 'duration': 942,
283 'age_limit': 0,
284 'live_status': 'not_live',
285 'playable_in_embed': True,
286 'availability': 'unlisted',
287 'categories': ['People & Blogs'],
288 'tags': [],
289 'channel_follower_count': int,
290 'like_count': int,
291 'view_count': int,
292 'thumbnail': r're:^https?://.*\.(?:jpg|png|webp)',
293 'thumbnails': 'count:21',
294 'chapters': 'count:20',
295 },
296 'params': {
297 'skip_download': 'm3u8',
298 },
299 }, {
300 # /v3/ slides, .png only, service_name = yoda
301 'url': 'https://slideslive.com/38983994',
302 'info_dict': {
303 'id': '38983994',
304 'ext': 'mp4',
305 'title': 'Zero-Shot AutoML with Pretrained Models',
306 'timestamp': 1662384834,
307 'upload_date': '20220905',
308 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
309 'thumbnails': 'count:23',
310 'chapters': 'count:22',
311 'duration': 295,
312 },
313 'params': {
314 'skip_download': 'm3u8',
315 },
316 }, {
317 # service_name = yoda
318 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend',
319 'only_matching': True,
320 }, {
321 # dead link, service_name = url
322 'url': 'https://slideslive.com/38922070/learning-transferable-skills-1',
323 'only_matching': True,
324 }, {
325 # dead link, service_name = vimeo
326 'url': 'https://slideslive.com/38921896/retrospectives-a-venue-for-selfreflection-in-ml-research-3',
327 'only_matching': True,
328 }]
329
330 _WEBPAGE_TESTS = [{
331 # only XML slides info
332 'url': 'https://iclr.cc/virtual_2020/poster_Hklr204Fvr.html',
333 'info_dict': {
334 'id': '38925850',
335 'ext': 'mp4',
336 'title': 'Towards a Deep Network Architecture for Structured Smoothness',
337 'thumbnail': r're:^https?://.*\.jpg',
338 'thumbnails': 'count:8',
339 'timestamp': 1629671508,
340 'upload_date': '20210822',
341 'chapters': 'count:7',
342 'duration': 326,
343 },
344 'params': {
345 'skip_download': 'm3u8',
346 },
347 }]
348
@classmethod
def _extract_embed_urls(cls, url, webpage):
    """Yield a SlidesLive embed URL for each player instantiated in *webpage*.

    Every ``new SlidesLiveEmbed(...)`` call that carries a numeric
    ``presentationId`` produces one URL. The embedding page's URL and origin
    are attached as the ``embed_parent_url`` and ``embed_container_origin``
    query parameters, which ``_real_extract`` reads back to build the
    ``Referer`` and ``Origin`` request headers.

    :param url: URL of the page that embeds the player.
    :param webpage: HTML/JS text of that page.
    """
    # Reference: https://slideslive.com/embed_presentation.js
    # The origin depends only on the embedding page, so compute it once
    # rather than re-parsing the same URL for every match.
    url_parsed = urllib.parse.urlparse(url)
    origin = f'{url_parsed.scheme}://{url_parsed.netloc}'

    for embed_id in re.findall(
            r'(?s)new\s+SlidesLiveEmbed\s*\([^)]+\bpresentationId:\s*["\'](\d+)["\']', webpage):
        yield update_url_query(
            f'https://slideslive.com/embed/presentation/{embed_id}', {
                'embed_parent_url': url,
                'embed_container_origin': origin,
            })
360
def _download_embed_webpage_handle(self, video_id, headers):
    """Fetch the embed page of *video_id* and return the result of
    ``_download_webpage_handle`` unchanged.

    *headers* is sent as the request headers; its ``Referer`` and ``Origin``
    entries are additionally looked up with ``traverse_obj`` and passed as
    the ``embed_parent_url`` and ``embed_container_origin`` query parameters.
    """
    embed_page_url = f'https://slideslive.com/embed/presentation/{video_id}'
    embed_query = traverse_obj(headers, {
        'embed_parent_url': 'Referer',
        'embed_container_origin': 'Origin',
    })
    return self._download_webpage_handle(
        embed_page_url, video_id, headers=headers, query=embed_query)
368
369 def _extract_custom_m3u8_info(self, m3u8_data):
370 m3u8_dict = {}
371
372 lookup = {
373 'PRESENTATION-TITLE': 'title',
374 'PRESENTATION-UPDATED-AT': 'timestamp',
375 'PRESENTATION-THUMBNAIL': 'thumbnail',
376 'PLAYLIST-TYPE': 'playlist_type',
377 'VOD-VIDEO-SERVICE-NAME': 'service_name',
378 'VOD-VIDEO-ID': 'service_id',
379 'VOD-VIDEO-SERVERS': 'video_servers',
380 'VOD-SUBTITLES': 'subtitles',
381 'VOD-SLIDES-JSON-URL': 'slides_json_url',
382 'VOD-SLIDES-XML-URL': 'slides_xml_url',
383 }
384
385 for line in m3u8_data.splitlines():
386 if not line.startswith('#EXT-SL-'):
387 continue
388 tag, _, value = line.partition(':')
389 key = lookup.get(tag[8:])
390 if not key:
391 continue
392 m3u8_dict[key] = value
393
394 # Some values are stringified JSON arrays
395 for key in ('video_servers', 'subtitles'):
396 if key in m3u8_dict:
397 m3u8_dict[key] = self._parse_json(m3u8_dict[key], None, fatal=False) or []
398
399 return m3u8_dict
400
def _extract_formats_and_duration(self, cdn_hostname, path, video_id, skip_duration=False):
    """Gather HLS and DASH formats for a video hosted at ``https://<cdn_hostname>/<path>/``.

    Both manifests are requested non-fatally. Unless *skip_duration* is set,
    the duration is read from the first HLS format's playlist; the DASH
    manifest is consulted only if HLS produced no duration.

    :return: ``(formats, duration)``; *duration* is ``None`` when it was
        skipped or could not be determined.
    """
    base_url = f'https://{cdn_hostname}/{path}'
    mpd_url = f'{base_url}/master.mpd'
    want_duration = not skip_duration
    duration = None

    hls_formats = self._extract_m3u8_formats(
        f'{base_url}/master.m3u8',
        video_id, 'mp4', m3u8_id='hls', fatal=False, live=True)
    if hls_formats and want_duration:
        duration = self._extract_m3u8_vod_duration(
            hls_formats[0]['url'], video_id, note='Extracting duration from HLS manifest')

    dash_formats = self._extract_mpd_formats(
        mpd_url, video_id, mpd_id='dash', fatal=False)
    if dash_formats and want_duration and not duration:
        duration = self._extract_mpd_vod_duration(
            mpd_url, video_id, note='Extracting duration from DASH manifest')

    # HLS entries come first, followed by DASH entries
    formats = [*(hls_formats or []), *(dash_formats or [])]
    return formats, duration
423
def _real_extract(self, url):
    """Extract a SlidesLive presentation.

    Steps:
      1. Download the embed page, retrying with an allowed domain as
         ``Referer``/``Origin`` when redirected to ``domain_not_allowed``.
      2. Fetch the player playlist using the page's player token and parse
         its custom ``#EXT-SL-*`` tags.
      3. Build thumbnails and chapters from the slides JSON, falling back to
         the slides XML when the JSON yields nothing.
      4. Resolve the main video according to its service (``url``, ``yoda``,
         ``vimeo`` or ``youtube``).
      5. If any slide embeds a video, return a playlist made of the main
         video followed by the ``yoda``-hosted slide videos.

    :return: a single info dict, or a playlist result when slide videos exist.
    :raises ExtractorError: for embed-only presentations without a usable
        domain, for a missing/unsupported video service, for a missing
        service id, or when no video server is listed for a ``yoda`` video.
    """
    video_id = self._match_id(url)
    webpage, urlh = self._download_embed_webpage_handle(
        video_id, headers=traverse_obj(parse_qs(url), {
            'Referer': ('embed_parent_url', -1),
            'Origin': ('embed_container_origin', -1)}))
    redirect_url = urlh.url
    if 'domain_not_allowed' in redirect_url:
        # The redirect URL lists the domains that may embed this presentation
        domain = traverse_obj(parse_qs(redirect_url), ('allowed_domains[]', ...), get_all=False)
        if not domain:
            raise ExtractorError(
                'This is an embed-only presentation. Try passing --referer', expected=True)
        webpage, _ = self._download_embed_webpage_handle(video_id, headers={
            'Referer': f'https://{domain}/',
            'Origin': f'https://{domain}',
        })

    player_token = self._search_regex(r'data-player-token="([^"]+)"', webpage, 'player token')
    player_data = self._download_webpage(
        f'https://ben.slideslive.com/player/{video_id}', video_id,
        note='Downloading player info', query={'player_token': player_token})
    player_info = self._extract_custom_m3u8_info(player_data)

    # Validate remote data explicitly: `assert` is stripped under `python -O`
    # and a bare KeyError gives the user no hint about what went wrong
    service_name = (player_info.get('service_name') or '').lower()
    if service_name not in ('url', 'yoda', 'vimeo', 'youtube'):
        raise ExtractorError(f'Unsupported video service: {service_name!r}')
    service_id = player_info.get('service_id')
    if service_id is None:
        raise ExtractorError('Unable to extract video service id')

    slide_url_template = 'https://slides.slideslive.com/%s/slides/original/%s%s'
    slides, slides_info = {}, []

    if player_info.get('slides_json_url'):
        slides = self._download_json(
            player_info['slides_json_url'], video_id, fatal=False,
            note='Downloading slides JSON', errnote=False) or {}
        slide_ext_default = '.png'
        slide_quality = traverse_obj(slides, ('slide_qualities', 0))
        if slide_quality:
            # When qualities are listed, slides are served from the CDN as .jpg by default
            slide_ext_default = '.jpg'
            slide_url_template = f'https://cdn.slideslive.com/data/presentations/%s/slides/{slide_quality}/%s%s'
        for slide_id, slide in enumerate(traverse_obj(slides, ('slides', ...), expected_type=dict), 1):
            slides_info.append((
                slide_id, traverse_obj(slide, ('image', 'name')),
                traverse_obj(slide, ('image', 'extname'), default=slide_ext_default),
                int_or_none(slide.get('time'), scale=1000)))

    if not slides and player_info.get('slides_xml_url'):
        slides = self._download_xml(
            player_info['slides_xml_url'], video_id, fatal=False,
            note='Downloading slides XML', errnote='Failed to download slides info')
        if isinstance(slides, xml.etree.ElementTree.Element):
            slide_url_template = 'https://cdn.slideslive.com/data/presentations/%s/slides/big/%s%s'
            # NOTE(review): XML slides are numbered from 0 while JSON slides start at 1;
            # kept as-is since existing output depends on it - confirm whether intended
            for slide_id, slide in enumerate(slides.findall('./slide')):
                slides_info.append((
                    slide_id, xpath_text(slide, './slideName', 'name'), '.jpg',
                    int_or_none(xpath_text(slide, './timeSec', 'time'))))

    chapters, thumbnails = [], []
    if url_or_none(player_info.get('thumbnail')):
        thumbnails.append({'id': 'cover', 'url': player_info['thumbnail']})
    for slide_id, slide_path, slide_ext, start_time in slides_info:
        # A slide without an image still contributes a chapter
        if slide_path:
            thumbnails.append({
                'id': f'{slide_id:03d}',
                'url': slide_url_template % (video_id, slide_path, slide_ext),
            })
        chapters.append({
            'title': f'Slide {slide_id:03d}',
            'start_time': start_time,
        })

    subtitles = {}
    for sub in traverse_obj(player_info, ('subtitles', ...), expected_type=dict):
        webvtt_url = url_or_none(sub.get('webvtt_url'))
        if not webvtt_url:
            continue
        subtitles.setdefault(sub.get('language') or 'en', []).append({
            'url': webvtt_url,
            'ext': 'vtt',
        })

    info = {
        'id': video_id,
        'title': player_info.get('title') or self._html_search_meta('title', webpage, default=''),
        'timestamp': unified_timestamp(player_info.get('timestamp')),
        'is_live': player_info.get('playlist_type') != 'vod',
        'thumbnails': thumbnails,
        'chapters': chapters,
        'subtitles': subtitles,
    }

    if service_name == 'url':
        info['url'] = service_id
    elif service_name == 'yoda':
        # `video_servers` may be absent, or an empty list if its JSON failed to parse
        cdn_hostname = traverse_obj(player_info, ('video_servers', 0))
        if not cdn_hostname:
            raise ExtractorError('Unable to find a video server for this presentation')
        formats, duration = self._extract_formats_and_duration(
            cdn_hostname, service_id, video_id)
        info.update({
            'duration': duration,
            'formats': formats,
        })
    else:
        # vimeo / youtube: delegate to that extractor, keeping the metadata gathered here
        info.update({
            '_type': 'url_transparent',
            'url': service_id,
            'ie_key': service_name.capitalize(),
            'display_id': video_id,
        })
        if service_name == 'vimeo':
            info['url'] = smuggle_url(
                f'https://player.vimeo.com/video/{service_id}',
                {'referer': url})

    video_slides = traverse_obj(slides, ('slides', ..., 'video', 'id'))
    if not video_slides:
        return info

    def entries():
        """Yield the main video, then each ``yoda``-hosted slide video that has formats."""
        yield info

        service_data = self._download_json(
            f'https://ben.slideslive.com/player/{video_id}/slides_video_service_data',
            video_id, fatal=False, query={
                'player_token': player_token,
                'videos': ','.join(video_slides),
            }, note='Downloading video slides info', errnote='Failed to download video slides info') or {}

        for slide_id, slide in enumerate(traverse_obj(slides, ('slides', ...)), 1):
            if traverse_obj(slide, ('video', 'service')) != 'yoda':
                continue
            video_path = traverse_obj(slide, ('video', 'id'))
            if not video_path:
                continue
            cdn_hostname = traverse_obj(service_data, (
                video_path, 'video_servers', ...), get_all=False)
            if not cdn_hostname:
                continue
            formats, _ = self._extract_formats_and_duration(
                cdn_hostname, video_path, video_id, skip_duration=True)
            if not formats:
                continue
            yield {
                'id': f'{video_id}-{slide_id:03d}',
                'title': f'{info["title"]} - Slide {slide_id:03d}',
                'timestamp': info['timestamp'],
                'duration': int_or_none(traverse_obj(slide, ('video', 'duration_ms')), scale=1000),
                'formats': formats,
            }

    return self.playlist_result(entries(), f'{video_id}-playlist', info['title'])