]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/odnoklassniki.py
[cleanup] Misc
[yt-dlp.git] / yt_dlp / extractor / odnoklassniki.py
1 import urllib.parse
2
3 from .common import InfoExtractor
4 from ..compat import (
5 compat_etree_fromstring,
6 compat_parse_qs,
7 compat_urllib_parse_unquote,
8 compat_urllib_parse_urlparse,
9 )
10 from ..utils import (
11 ExtractorError,
12 HEADRequest,
13 float_or_none,
14 int_or_none,
15 qualities,
16 smuggle_url,
17 traverse_obj,
18 unescapeHTML,
19 unified_strdate,
20 unsmuggle_url,
21 url_or_none,
22 urlencode_postdata,
23 )
24
25
class OdnoklassnikiIE(InfoExtractor):
    """Extractor for videos hosted on ok.ru / odnoklassniki.ru."""

    # Accepts the bare, "www.", "m." and "mobile." hosts of both domains.
    # The optional ``embed`` group is set for ``/videoembed/`` URLs and the
    # ``id`` group captures the numeric (optionally dash-suffixed) video id.
    _VALID_URL = r'''(?x)
                https?://
                    (?:(?:www|m|mobile)\.)?
                    (?:odnoklassniki|ok)\.ru/
                    (?:
                        video(?P<embed>embed)?/|
                        web-api/video/moviePlayer/|
                        live/|
                        dk\?.*?st\.mvId=
                    )
                    (?P<id>[\d-]+)
                '''
    # Finds ``<iframe>`` embeds of the ``/videoembed/`` player in third-party pages.
    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1']
    # Download and URL-matching test cases for this extractor.
    _TESTS = [{
        'note': 'Coub embedded',
        'url': 'http://ok.ru/video/1484130554189',
        'info_dict': {
            'id': '1keok9',
            'ext': 'mp4',
            'timestamp': 1545580896,
            'view_count': int,
            'thumbnail': r're:^https?://.*\.jpg$',
            'title': 'Народная забава',
            'uploader': 'Nevata',
            'upload_date': '20181223',
            'age_limit': 0,
            'uploader_id': 'nevata.s',
            'like_count': int,
            'duration': 8.08,
            'repost_count': int,
        },
    }, {
        'note': 'vk.com embedded',
        'url': 'https://ok.ru/video/3568183087575',
        'info_dict': {
            'id': '-165101755_456243749',
            'ext': 'mp4',
            'uploader_id': '-165101755',
            'duration': 132,
            'timestamp': 1642869935,
            'upload_date': '20220122',
            'thumbnail': str,
            'title': str,
            'uploader': str,
        },
        'skip': 'vk extractor error',
    }, {
        # metadata in JSON, webm_dash with Firefox UA
        'url': 'http://ok.ru/video/20079905452',
        'md5': '8f477d8931c531374a3e36daec617b2c',
        'info_dict': {
            'id': '20079905452',
            'ext': 'webm',
            'title': 'Культура меняет нас (прекрасный ролик!))',
            'thumbnail': str,
            'duration': 100,
            'upload_date': '20141207',
            'uploader_id': '330537914540',
            'uploader': 'Виталий Добровольский',
            'like_count': int,
            'age_limit': 0,
        },
        'params': {
            'format': 'bv[ext=webm]',
            'http_headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0'},
        },
    }, {
        # metadataUrl
        'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
        'md5': '2bae2f58eefe1b3d26f3926c4a64d2f3',
        'info_dict': {
            'id': '63567059965189-0',
            'ext': 'mp4',
            'title': 'Девушка без комплексов ...',
            'thumbnail': str,
            'duration': 191,
            'upload_date': '20150518',
            'uploader_id': '534380003155',
            'uploader': '☭ Андрей Мещанинов ☭',
            'like_count': int,
            'age_limit': 0,
            'start_time': 5,
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        # YouTube embed (metadataUrl, provider == USER_YOUTUBE)
        'url': 'https://ok.ru/video/3952212382174',
        'md5': '5fb5f83ce16cb212d6bf887282b5da53',
        'info_dict': {
            'id': '5axVgHHDBvU',
            'ext': 'mp4',
            'title': 'Youtube-dl 101: What is it and HOW to use it! Full Download Walkthrough and Guide',
            'description': 'md5:b57209eeb9d5c2f20c984dfb58862097',
            'uploader': 'Lod Mer',
            'uploader_id': '575186401502',
            'duration': 1529,
            'age_limit': 0,
            'upload_date': '20210405',
            'comment_count': int,
            'live_status': 'not_live',
            'view_count': int,
            'thumbnail': 'https://i.mycdn.me/i?r=AEHujHvw2RjEbemUCNEorZbxYpb_p_9AcN2FmGik64Krkcmz37YtlY093oAM5-HIEAt7Zi9s0CiBOSDmbngC-I-k&fn=external_8',
            'uploader_url': 'https://www.youtube.com/@MrKewlkid94',
            'channel_follower_count': int,
            'tags': ['youtube-dl', 'youtube playlists', 'download videos', 'download audio'],
            'channel_id': 'UCVGtvURtEURYHtJFUegdSug',
            'like_count': int,
            'availability': 'public',
            'channel_url': 'https://www.youtube.com/channel/UCVGtvURtEURYHtJFUegdSug',
            'categories': ['Education'],
            'playable_in_embed': True,
            'channel': 'BornToReact',
        },
    }, {
        # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field)
        'url': 'http://ok.ru/video/62036049272859-0',
        'info_dict': {
            'id': '62036049272859-0',
            'ext': 'mp4',
            'title': 'МУЗЫКА ДОЖДЯ .',
            'description': 'md5:6f1867132bd96e33bf53eda1091e8ed0',
            'upload_date': '20120106',
            'uploader_id': '473534735899',
            'uploader': 'МARINA D',
            'age_limit': 0,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Video has not been found',
    }, {
        'note': 'Only available in mobile webpage',
        'url': 'https://m.ok.ru/video/2361249957145',
        'info_dict': {
            'id': '2361249957145',
            'ext': 'mp4',
            'title': 'Быковское крещение',
            'duration': 3038.181,
            'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+',
        },
    }, {
        'note': 'subtitles',
        'url': 'https://ok.ru/video/4249587550747',
        'info_dict': {
            'id': '4249587550747',
            'ext': 'mp4',
            'title': 'Small Country An African Childhood (2020) (1080p) +subtitle',
            'uploader': 'Sunflower Movies',
            'uploader_id': '595802161179',
            'upload_date': '20220816',
            'duration': 6728,
            'age_limit': 0,
            'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+',
            'like_count': int,
            'subtitles': dict,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
        'only_matching': True,
    }, {
        'url': 'http://www.ok.ru/video/20648036891',
        'only_matching': True,
    }, {
        'url': 'http://www.ok.ru/videoembed/20648036891',
        'only_matching': True,
    }, {
        'url': 'http://m.ok.ru/video/20079905452',
        'only_matching': True,
    }, {
        'url': 'http://mobile.ok.ru/video/20079905452',
        'only_matching': True,
    }, {
        'url': 'https://www.ok.ru/live/484531969818',
        'only_matching': True,
    }, {
        'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#',
        'only_matching': True,
    }, {
        # Paid video
        'url': 'https://ok.ru/video/954886983203',
        'only_matching': True,
    }, {
        'url': 'https://ok.ru/videoembed/2932705602075',
        'info_dict': {
            'id': '2932705602075',
            'ext': 'mp4',
            'thumbnail': 'https://i.mycdn.me/videoPreview?id=1369902483995&type=37&idx=2&tkn=fqlnoQD_xwq5ovIlKfgNyU08qmM&fn=external_8',
            'title': 'Boosty для тебя!',
            'uploader_id': '597811038747',
            'like_count': 0,
            'duration': 35,
        },
    }]

    # Test for discovering the embedded player inside a third-party page
    # (currently skipped, see its 'skip' reason).
    _WEBPAGE_TESTS = [{
        'url': 'https://boosty.to/ikakprosto/posts/56cedaca-b56a-4dfd-b3ed-98c79cfa0167',
        'info_dict': {
            'id': '3950343629563',
            'ext': 'mp4',
            'thumbnail': 'https://i.mycdn.me/videoPreview?id=2776238394107&type=37&idx=11&tkn=F3ejkUFcpuI4DnMRxrDGcH5YcmM&fn=external_8',
            'title': 'Заяц Бусти.mp4',
            'uploader_id': '571368965883',
            'like_count': 0,
            'duration': 10444,
        },
        'skip': 'Site no longer embeds',
    }]
237
238 def _clear_cookies(self, cdn_url):
239 # Direct http downloads will fail if CDN cookies are set
240 # so we need to reset them after each format extraction
241 self.cookiejar.clear(domain='.mycdn.me')
242 self.cookiejar.clear(domain=urllib.parse.urlparse(cdn_url).hostname)
243
@classmethod
def _extract_embed_urls(cls, url, webpage):
    """Yield each embed URL found by the parent class, smuggled with the
    embedding page's ``url`` as ``referrer``."""
    parent_embeds = super()._extract_embed_urls(url, webpage)
    # A new dict is built for every URL rather than sharing one instance.
    yield from (
        smuggle_url(embed_url, {'referrer': url})
        for embed_url in parent_embeds)
248
249 def _real_extract(self, url):
250 try:
251 return self._extract_desktop(url)
252 except ExtractorError as e:
253 try:
254 return self._extract_mobile(url)
255 except ExtractorError:
256 # error message of desktop webpage is in English
257 raise e
258
def _extract_desktop(self, url):
    """Extract a video from the desktop ok.ru page.

    The player options embedded in the page (``data-options``) provide the
    metadata, either inline (``flashvars['metadata']``) or through a POST
    request to ``flashvars['metadataUrl']``.

    Returns an info dict. External players are delegated with
    ``url_result``; the OPEN_GRAPH and USER_YOUTUBE providers produce a
    ``url_transparent`` result pointing at ``movie['contentId']``.

    Raises ExtractorError when the page shows an error stub, or when a
    natively hosted video has no title.
    """
    start_time = int_or_none(compat_parse_qs(
        compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0])

    url, smuggled = unsmuggle_url(url, {})
    referrer = smuggled.get('referrer')
    video_id, is_embed = self._match_valid_url(url).group('id', 'embed')
    mode = 'videoembed' if is_embed else 'video'

    webpage = self._download_webpage(
        f'https://ok.ru/{mode}/{video_id}', video_id,
        note='Downloading desktop webpage',
        headers={'Referer': referrer} if referrer else {})

    error = self._search_regex(
        r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<',
        webpage, 'error', default=None)
    # Direct link from boosty: retry once with a boosty.to referrer
    if (error == 'The author of this video has not been found or is blocked'
            and not referrer and mode == 'videoembed'):
        return self._extract_desktop(smuggle_url(url, {'referrer': 'https://boosty.to'}))
    elif error:
        raise ExtractorError(error, expected=True)

    player = self._parse_json(
        unescapeHTML(self._search_regex(
            r'data-options=(?P<quote>["\'])(?P<player>{.+?%s.+?})(?P=quote)' % video_id,
            webpage, 'player', group='player')),
        video_id)

    # embedded external player
    if player.get('isExternalPlayer') and player.get('url'):
        return self.url_result(player['url'])

    flashvars = player['flashvars']

    metadata = flashvars.get('metadata')
    if metadata:
        metadata = self._parse_json(metadata, video_id)
    else:
        data = {}
        st_location = flashvars.get('location')
        if st_location:
            data['st.location'] = st_location
        metadata = self._download_json(
            compat_urllib_parse_unquote(flashvars['metadataUrl']),
            video_id, 'Downloading metadata JSON',
            data=urlencode_postdata(data))

    movie = metadata['movie']

    # Some embedded videos may not contain title in movie dict (e.g.
    # http://ok.ru/video/62036049272859-0) thus we allow missing title
    # here and it's going to be extracted later by an extractor that
    # will process the actual embed.
    provider = metadata.get('provider')
    title = movie['title'] if provider == 'UPLOADED_ODKL' else movie.get('title')

    thumbnail = movie.get('poster')
    duration = int_or_none(movie.get('duration'))

    # `or {}` also covers an explicit null author in the JSON
    author = metadata.get('author') or {}
    uploader_id = author.get('id')
    uploader = author.get('name')

    upload_date = unified_strdate(self._html_search_meta(
        'ya:ovs:upload_date', webpage, 'upload date', default=None))

    age_limit = None
    adult = self._html_search_meta(
        'ya:ovs:adult', webpage, 'age limit', default=None)
    if adult:
        age_limit = 18 if adult == 'true' else 0

    like_count = int_or_none(metadata.get('likeCount'))

    subtitles = {}
    for sub in traverse_obj(metadata, ('movie', 'subtitleTracks', ...), expected_type=dict):
        sub_url = sub.get('url')
        if not sub_url:
            continue
        subtitles.setdefault(sub.get('language') or 'en', []).append({
            'url': sub_url,
            'ext': 'vtt',
        })

    info = {
        'id': video_id,
        'title': title,
        'thumbnail': thumbnail,
        'duration': duration,
        'upload_date': upload_date,
        'uploader': uploader,
        'uploader_id': uploader_id,
        'like_count': like_count,
        'age_limit': age_limit,
        'start_time': start_time,
        'subtitles': subtitles,
    }

    # Externally hosted content (pladform via OPEN_GRAPH, YouTube via
    # USER_YOUTUBE): hand over to the extractor of the real host.
    if provider in ('OPEN_GRAPH', 'USER_YOUTUBE'):
        info.update({
            '_type': 'url_transparent',
            'url': movie['contentId'],
        })
        return info

    # Natively hosted videos must have a title. Raise explicitly instead
    # of using `assert`, which is removed when Python runs with -O.
    if not title:
        raise ExtractorError('Unable to extract title')

    quality = qualities(('4', '0', '1', '2', '3', '5', '6', '7'))

    formats = [{
        'url': f['url'],
        'ext': 'mp4',
        'format_id': f.get('name'),
    } for f in traverse_obj(metadata, ('videos', lambda _, v: url_or_none(v['url'])))]

    m3u8_url = traverse_obj(metadata, 'hlsManifestUrl', 'ondemandHls')
    if m3u8_url:
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))
        self._clear_cookies(m3u8_url)

    for mpd_id, mpd_key in [('dash', 'ondemandDash'), ('webm', 'metadataWebmUrl')]:
        mpd_url = metadata.get(mpd_key)
        if mpd_url:
            formats.extend(self._extract_mpd_formats(
                mpd_url, video_id, mpd_id=mpd_id, fatal=False))
            self._clear_cookies(mpd_url)

    dash_manifest = metadata.get('metadataEmbedded')
    if dash_manifest:
        formats.extend(self._parse_mpd_formats(
            compat_etree_fromstring(dash_manifest), 'mpd'))

    # Rank the formats collected so far by the "type" digit in their URL;
    # the live formats appended below are not ranked.
    for fmt in formats:
        fmt_type = self._search_regex(
            r'\btype[/=](\d)', fmt['url'],
            'format type', default=None)
        if fmt_type:
            fmt['quality'] = quality(fmt_type)

    # Live formats
    m3u8_url = metadata.get('hlsMasterPlaylistUrl')
    if m3u8_url:
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
        self._clear_cookies(m3u8_url)
    rtmp_url = metadata.get('rtmpUrl')
    if rtmp_url:
        formats.append({
            'url': rtmp_url,
            'format_id': 'rtmp',
            'ext': 'flv',
        })

    if not formats:
        payment_info = metadata.get('paymentInfo')
        if payment_info:
            self.raise_no_formats('This video is paid, subscribe to download it', expected=True)

    info['formats'] = formats
    return info
432
def _extract_mobile(self, url):
    """Extract a video from the mobile (m.ok.ru) page.

    Reads the JSON stored in the page's ``data-video`` attribute and
    resolves its ``videoSrc`` with a HEAD request to obtain the final
    download URL.

    Returns an info dict with a single ``mobile`` mp4 format.

    Raises ExtractorError when the page reports an error or carries no
    ``videoSrc``.
    """
    video_id = self._match_id(url)

    webpage = self._download_webpage(
        f'http://m.ok.ru/video/{video_id}', video_id,
        note='Downloading mobile webpage')

    error = self._search_regex(
        r'видео</a>\s*<div\s+class="empty">(.+?)</div>',
        webpage, 'error', default=None)
    if error:
        raise ExtractorError(error, expected=True)

    json_data = self._search_regex(
        r'data-video="(.+?)"', webpage, 'json data')
    json_data = self._parse_json(unescapeHTML(json_data), video_id) or {}

    # Raise ExtractorError rather than KeyError so that the caller's
    # desktop/mobile fallback handling can still report the desktop error.
    video_src = json_data.get('videoSrc')
    if not video_src:
        raise ExtractorError('Unable to extract video URL from mobile webpage')

    redirect_url = self._request_webpage(
        HEADRequest(video_src), video_id, 'Requesting download URL').geturl()
    self._clear_cookies(redirect_url)

    return {
        'id': video_id,
        'title': json_data.get('videoName'),
        # videoDuration is divided by 1000 (scale) before being reported
        'duration': float_or_none(json_data.get('videoDuration'), scale=1000),
        'thumbnail': json_data.get('videoPosterSrc'),
        'formats': [{
            'format_id': 'mobile',
            'url': redirect_url,
            'ext': 'mp4',
        }],
    }