jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/videoken.py
[ie] Do not smuggle `http_headers`
[yt-dlp.git] / yt_dlp / extractor / videoken.py
1 import base64
2 import functools
3 import math
4 import re
5 import time
6 import urllib.parse
7
8 from .common import InfoExtractor
9 from .slideslive import SlidesLiveIE
10 from ..utils import (
11 ExtractorError,
12 InAdvancePagedList,
13 int_or_none,
14 traverse_obj,
15 update_url_query,
16 url_or_none,
17 )
18
19
class VideoKenBaseIE(InfoExtractor):
    """Helpers shared by the VideoKen extractors.

    Holds the portal hostname -> organization mapping, looks up the
    organization ID / API key from the VideoKen analytics API, builds
    SlidesLive embed URLs and turns API video records into url results.
    """

    # Supported portal hostnames mapped to the organization name used in API paths
    _ORGANIZATIONS = {
        'videos.icts.res.in': 'icts',
        'videos.cncf.io': 'cncf',
        'videos.neurips.cc': 'neurips',
    }
    # URL prefix regex; the `host` group captures one of the hostnames above
    _BASE_URL_RE = rf'https?://(?P<host>{"|".join(map(re.escape, _ORGANIZATIONS))})/'

    # Number of items requested per page by the paginated extractors
    _PAGE_SIZE = 12

    # Prefix carried by VideoKen video IDs that refer to SlidesLive videos
    _SLIDESLIVE_ID_PREFIX = 'slideslive-'

    def _get_org_id_and_api_key(self, org, video_id):
        """Download the organization details and return `(id, apikey)`.

        @param org       Organization name (a value of `_ORGANIZATIONS`)
        @param video_id  ID passed to `_download_json` as the item being processed
        @raises KeyError if the response lacks `id` or `apikey`
        """
        details = self._download_json(
            f'https://analytics.videoken.com/api/videolake/{org}/details', video_id,
            note='Downloading organization ID and API key', headers={
                'Accept': 'application/json',
            })
        return details['id'], details['apikey']

    def _create_slideslive_url(self, video_url, video_id, referer):
        """Return a SlidesLive embed URL for the video, or None if none can be built.

        @param video_url  Embed URL taken from the API record; may be None
        @param video_id   VideoKen video ID, optionally prefixed with `slideslive-`
        @param referer    URL of the embedding page; when it is a valid URL it is
                          added to the result as embed query parameters
        """
        if not video_url or 'embed/sign-in' in video_url:
            # No usable embed URL: rebuild it from the ID, if there is one
            if not video_id:
                return None
            # Remove the prefix as a whole. `str.lstrip` takes a *set* of characters,
            # so it would also eat leading 's', 'l', 'i', 'd', 'e', 'v' and '-'
            # characters that belong to the actual ID
            slideslive_id = video_id
            if slideslive_id.startswith(self._SLIDESLIVE_ID_PREFIX):
                slideslive_id = slideslive_id[len(self._SLIDESLIVE_ID_PREFIX):]
            video_url = f'https://slideslive.com/embed/{slideslive_id}'
        if url_or_none(referer):
            return update_url_query(video_url, {
                'embed_parent_url': referer,
                'embed_container_origin': f'https://{urllib.parse.urlparse(referer).netloc}',
            })
        return video_url

    def _extract_videos(self, videos, url):
        """Yield a url result for every usable record under `videos`/`results`.

        @param videos  API response containing a `videos` and/or `results` list
        @param url     URL of the page being extracted; used as the SlidesLive referer
        """
        for video in traverse_obj(videos, (('videos', 'results'), ...)):
            video_id = traverse_obj(video, 'youtube_id', 'videoid')
            if not video_id:
                continue
            ie_key = None
            if traverse_obj(video, 'type', 'source') == 'youtube':
                # For YouTube records the ID itself is handed to the Youtube extractor
                video_url = video_id
                ie_key = 'Youtube'
            else:
                video_url = traverse_obj(video, 'embed_url', 'embeddableurl')
                # Skip records without an embed URL before trying to parse it
                if not video_url:
                    continue
                if urllib.parse.urlparse(video_url).netloc == 'slideslive.com':
                    ie_key = SlidesLiveIE
                    video_url = self._create_slideslive_url(video_url, video_id, url)
            if not video_url:
                continue
            yield self.url_result(video_url, ie_key, video_id)
67
68
class VideoKenIE(VideoKenBaseIE):
    """Extractor for single video pages on the VideoKen-hosted portals.

    The video is delegated to another extractor (SlidesLive or Youtube) via
    a url result; nothing is downloaded from VideoKen besides API metadata.
    """

    _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:(?:topic|category)/[^/#?]+/)?video/(?P<id>[\w-]+)'
    _TESTS = [{
        # neurips -> videoken -> slideslive
        'url': 'https://videos.neurips.cc/video/slideslive-38922815',
        'info_dict': {
            'id': '38922815',
            'ext': 'mp4',
            'title': 'Efficient Processing of Deep Neural Network: from Algorithms to Hardware Architectures',
            'timestamp': 1630939331,
            'upload_date': '20210906',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'thumbnails': 'count:330',
            'chapters': 'count:329',
        },
        'params': {
            'skip_download': 'm3u8',
        },
        'expected_warnings': ['Failed to download VideoKen API JSON'],
    }, {
        # neurips -> videoken -> slideslive -> youtube
        'url': 'https://videos.neurips.cc/topic/machine%20learning/video/slideslive-38923348',
        'info_dict': {
            'id': '2Xa_dt78rJE',
            'ext': 'mp4',
            'display_id': '38923348',
            'title': 'Machine Education',
            'description': 'Watch full version of this video at https://slideslive.com/38923348.',
            'channel': 'SlidesLive Videos - G2',
            'channel_id': 'UCOExahQQ588Da8Nft_Ltb9w',
            'channel_url': 'https://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w',
            'uploader': 'SlidesLive Videos - G2',
            'uploader_id': 'UCOExahQQ588Da8Nft_Ltb9w',
            'uploader_url': 'http://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w',
            'duration': 2504,
            'timestamp': 1618922125,
            'upload_date': '20200131',
            'age_limit': 0,
            'channel_follower_count': int,
            'view_count': int,
            'availability': 'unlisted',
            'live_status': 'not_live',
            'playable_in_embed': True,
            'categories': ['People & Blogs'],
            'tags': [],
            'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
            'thumbnails': 'count:78',
            'chapters': 'count:77',
        },
        'params': {
            'skip_download': 'm3u8',
        },
        'expected_warnings': ['Failed to download VideoKen API JSON'],
    }, {
        # icts -> videoken -> youtube
        'url': 'https://videos.icts.res.in/topic/random%20variable/video/zysIsojYdvc',
        'info_dict': {
            'id': 'zysIsojYdvc',
            'ext': 'mp4',
            'title': 'Small-worlds, complex networks and random graphs (Lecture 3) by Remco van der Hofstad',
            'description': 'md5:87433069d79719eeadc1962cc2ace00b',
            'channel': 'International Centre for Theoretical Sciences',
            'channel_id': 'UCO3xnVTHzB7l-nc8mABUJIQ',
            'channel_url': 'https://www.youtube.com/channel/UCO3xnVTHzB7l-nc8mABUJIQ',
            'uploader': 'International Centre for Theoretical Sciences',
            'uploader_id': 'ICTStalks',
            'uploader_url': 'http://www.youtube.com/user/ICTStalks',
            'duration': 3372,
            'upload_date': '20191004',
            'age_limit': 0,
            'live_status': 'not_live',
            'availability': 'public',
            'playable_in_embed': True,
            'channel_follower_count': int,
            'like_count': int,
            'view_count': int,
            'categories': ['Science & Technology'],
            'tags': [],
            'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
            'thumbnails': 'count:42',
            'chapters': 'count:20',
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        'url': 'https://videos.cncf.io/category/478/video/IL4nxbmUIX8',
        'only_matching': True,
    }, {
        'url': 'https://videos.cncf.io/topic/kubernetes/video/YAM2d7yTrrI',
        'only_matching': True,
    }, {
        'url': 'https://videos.icts.res.in/video/d7HuP_abpKU',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a VideoKen video page to a url result for the hosting site.

        The VideoKen API is queried first (non-fatally). If it gives nothing
        usable, the target is derived from the video ID: a `slideslive-`
        prefixed ID goes to SlidesLive, an 11-character ID goes to Youtube.

        @raises ExtractorError if neither the API nor the ID yields a target
        """
        hostname, video_id = self._match_valid_url(url).group('host', 'id')
        org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], video_id)
        details = self._download_json(
            'https://analytics.videoken.com/api/videoinfo_private', video_id, query={
                'videoid': video_id,
                'org_id': org_id,
            }, headers={'Accept': 'application/json'}, note='Downloading VideoKen API JSON',
            errnote='Failed to download VideoKen API JSON', fatal=False)
        if details:
            # Use a default so that a response without any usable entry falls
            # through to the ID-based fallbacks instead of raising StopIteration
            entry = next(self._extract_videos({'videos': [details]}, url), None)
            if entry:
                return entry

        # fallback for API error 400 response
        if video_id.startswith('slideslive-'):
            return self.url_result(
                self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id)
        if re.match(r'^[\w-]{11}$', video_id):
            # The result must be returned; without `return` the extractor yields None
            return self.url_result(video_id, 'Youtube', video_id)
        raise ExtractorError('Unable to extract without VideoKen API response')
184
185
class VideoKenPlayerIE(VideoKenBaseIE):
    # Handles player.videoken.com embed URLs whose ID is a `slideslive-` prefixed
    # number and hands the video over to the SlidesLive extractor

    _VALID_URL = r'https?://player\.videoken\.com/embed/slideslive-(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://player.videoken.com/embed/slideslive-38968434',
        'info_dict': {
            'id': '38968434',
            'ext': 'mp4',
            'title': 'Deep Learning with Label Differential Privacy',
            'timestamp': 1643377020,
            'upload_date': '20220128',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'thumbnails': 'count:30',
            'chapters': 'count:29',
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }]

    def _real_extract(self, url):
        """Return a url result pointing at the SlidesLive embed for this player URL."""
        slideslive_id = self._match_id(url)
        # No embed URL is known here, so it is built from the numeric ID alone;
        # the player URL itself is passed along as the referer
        embed_url = self._create_slideslive_url(None, slideslive_id, url)
        return self.url_result(embed_url, SlidesLiveIE, slideslive_id)
209
210
class VideoKenPlaylistIE(VideoKenBaseIE):
    # Handles playlist pages (optionally nested under a category) on the
    # VideoKen-hosted portals

    _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:category/\d+/)?playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://videos.icts.res.in/category/1822/playlist/381',
        'playlist_mincount': 117,
        'info_dict': {
            'id': '381',
            'title': 'Cosmology - The Next Decade',
        },
    }]

    def _real_extract(self, url):
        """Download the playlist items and return them as a playlist result."""
        mobj = self._match_valid_url(url)
        hostname = mobj.group('host')
        playlist_id = mobj.group('id')

        # Only the organization ID is needed here; the API key is discarded
        org_id = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], playlist_id)[0]

        api_url = f'https://analytics.videoken.com/api/{org_id}/playlistitems/{playlist_id}/'
        playlist = self._download_json(
            api_url, playlist_id,
            headers={'Accept': 'application/json'},
            note='Downloading API JSON')

        entries = self._extract_videos(playlist, url)
        return self.playlist_result(entries, playlist_id, playlist.get('title'))
229
230
class VideoKenCategoryIE(VideoKenBaseIE):
    # Handles category listing pages on the VideoKen-hosted portals; the
    # videos are fetched page by page from the `category_videos` API endpoint

    _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'category/(?P<id>\d+)/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://videos.icts.res.in/category/1822/',
        'playlist_mincount': 500,
        'info_dict': {
            'id': '1822',
            'title': 'Programs',
        },
    }, {
        'url': 'https://videos.neurips.cc/category/350/',
        'playlist_mincount': 34,
        'info_dict': {
            'id': '350',
            'title': 'NeurIPS 2018',
        },
    }, {
        'url': 'https://videos.cncf.io/category/479/',
        'playlist_mincount': 328,
        'info_dict': {
            'id': '479',
            'title': 'KubeCon + CloudNativeCon Europe\'19',
        },
    }]

    def _get_category_page(self, category_id, org_id, page=1, note=None):
        """Fetch one page of category videos; returns `{}` if the download fails.

        @param page  Page number sent to the API as `page_number`
        @param note  Progress message; defaults to 'Downloading category page N'
        """
        api_url = f'https://analytics.videoken.com/api/videolake/{org_id}/category_videos'
        params = {
            'category_id': category_id,
            'page_number': page,
            'length': self._PAGE_SIZE,
        }
        response = self._download_json(
            api_url, category_id,
            fatal=False, note=note or f'Downloading category page {page}',
            query=params, headers={'Accept': 'application/json'})
        return response or {}

    def _entries(self, category_id, org_id, url, page):
        """Yield the url results of one page of the category."""
        # The API is queried with `page + 1` - presumably the paged list
        # counts pages from zero while the API counts from one
        page_data = self._get_category_page(category_id, org_id, page + 1)
        yield from self._extract_videos(page_data, url)

    def _real_extract(self, url):
        """Return the category as a lazily paged playlist."""
        mobj = self._match_valid_url(url)
        hostname = mobj.group('host')
        category_id = mobj.group('id')
        org_id = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], category_id)[0]

        # A first request provides the category name and the total record count
        info = self._get_category_page(category_id, org_id, note='Downloading category info')
        title = info['category_name']
        record_count = int(info['recordsTotal'])
        page_count = math.ceil(record_count / self._PAGE_SIZE)

        fetch_page = functools.partial(self._entries, category_id, org_id, url)
        paged_entries = InAdvancePagedList(fetch_page, page_count, self._PAGE_SIZE)
        return self.playlist_result(paged_entries, category_id, title)
279
280
class VideoKenTopicIE(VideoKenBaseIE):
    # Handles topic listing pages on the VideoKen-hosted portals; the videos
    # are fetched page by page from the es.videoken.com `get_results` endpoint

    _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'topic/(?P<id>[^/#?]+)/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://videos.neurips.cc/topic/machine%20learning/',
        'playlist_mincount': 500,
        'info_dict': {
            'id': 'machine_learning',
            'title': 'machine learning',
        },
    }, {
        'url': 'https://videos.icts.res.in/topic/gravitational%20waves/',
        'playlist_mincount': 77,
        'info_dict': {
            'id': 'gravitational_waves',
            'title': 'gravitational waves',
        },
    }, {
        'url': 'https://videos.cncf.io/topic/prometheus/',
        'playlist_mincount': 134,
        'info_dict': {
            'id': 'prometheus',
            'title': 'prometheus',
        },
    }]

    def _get_topic_page(self, topic, org_id, search_id, api_key, page=1, note=None):
        """Fetch one page of topic results; returns `{}` if the download fails.

        @param topic      Unquoted topic name, sent as the `query` parameter
        @param search_id  Value sent as the `searchid` parameter
        @param api_key    Value sent as the `token` parameter
        @param page       Page number sent to the API as `page`
        @param note       Progress message; defaults to 'Downloading topic page N'
        """
        params = {
            'orgid': org_id,
            'size': self._PAGE_SIZE,
            'query': topic,
            'page': page,
            'sort': 'upload_desc',
            'filter': 'all',
            'token': api_key,
            'is_topic': 'true',
            'category': '',
            'searchid': search_id,
        }
        response = self._download_json(
            'https://es.videoken.com/api/v1.0/get_results', topic,
            fatal=False, query=params, headers={'Accept': 'application/json'},
            note=note or f'Downloading topic page {page}')
        return response or {}

    def _entries(self, topic, org_id, search_id, api_key, url, page):
        """Yield the url results of one page of the topic."""
        # The API is queried with `page + 1` - presumably the paged list
        # counts pages from zero while the API counts from one
        page_data = self._get_topic_page(topic, org_id, search_id, api_key, page + 1)
        yield from self._extract_videos(page_data, url)

    def _real_extract(self, url):
        """Return the topic as a lazily paged playlist."""
        mobj = self._match_valid_url(url)
        hostname = mobj.group('host')
        # The human readable topic is the URL-decoded path component; the
        # playlist ID is the same text with spaces replaced by underscores
        topic = urllib.parse.unquote(mobj.group('id'))
        topic_id = topic.replace(' ', '_')

        org_id, api_key = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], topic)

        # The search ID is the base64 form of ':<topic>:<unix time>:transient'
        raw_search_id = f':{topic}:{int(time.time())}:transient'
        search_id = base64.b64encode(raw_search_id.encode()).decode()

        # A first request provides the number of result pages
        topic_info = self._get_topic_page(
            topic, org_id, search_id, api_key, note='Downloading topic info')
        page_count = int_or_none(topic_info['total_no_of_pages'])

        fetch_page = functools.partial(
            self._entries, topic, org_id, search_id, api_key, url)
        paged_entries = InAdvancePagedList(fetch_page, page_count, self._PAGE_SIZE)
        return self.playlist_result(paged_entries, topic_id, topic)