# Source: jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/odnoklassniki.py
# Object: 0d0ad0bb86b2e5c46a699983d344c4b6e3159ded
# Path: [yt-dlp.git] / yt_dlp / extractor / odnoklassniki.py
1 import urllib.parse
2
3 from .common import InfoExtractor
4 from ..compat import (
5 compat_etree_fromstring,
6 compat_parse_qs,
7 compat_urllib_parse_unquote,
8 compat_urllib_parse_urlparse,
9 )
10 from ..utils import (
11 ExtractorError,
12 HEADRequest,
13 float_or_none,
14 int_or_none,
15 qualities,
16 smuggle_url,
17 traverse_obj,
18 unescapeHTML,
19 unified_strdate,
20 unsmuggle_url,
21 url_or_none,
22 urlencode_postdata,
23 )
24
25
class OdnoklassnikiIE(InfoExtractor):
    """Extractor for videos served from ok.ru / odnoklassniki.ru.

    Accepted URL shapes (see ``_VALID_URL``): ``/video/<id>``,
    ``/videoembed/<id>``, ``/web-api/video/moviePlayer/<id>``, ``/live/<id>``
    and the mobile ``/dk?...st.mvId=<id>`` form, on the bare domain or the
    ``www``/``m``/``mobile`` subdomains.

    Extraction first tries the desktop page and falls back to the mobile page
    when the desktop attempt fails with an ``ExtractorError``.
    """

    _VALID_URL = r'''(?x)
                https?://
                    (?:(?:www|m|mobile)\.)?
                    (?:odnoklassniki|ok)\.ru/
                    (?:
                        video(?P<embed>embed)?/|
                        web-api/video/moviePlayer/|
                        live/|
                        dk\?.*?st\.mvId=
                    )
                    (?P<id>[\d-]+)
                '''
    # Matches <iframe> embeds pointing at the /videoembed/ player.
    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1']
    _TESTS = [{
        'note': 'Coub embedded',
        'url': 'http://ok.ru/video/1484130554189',
        'info_dict': {
            'id': '1keok9',
            'ext': 'mp4',
            'timestamp': 1545580896,
            'view_count': int,
            'thumbnail': r're:^https?://.*\.jpg$',
            'title': 'Народная забава',
            'uploader': 'Nevata',
            'upload_date': '20181223',
            'age_limit': 0,
            'uploader_id': 'nevata.s',
            'like_count': int,
            'duration': 8.08,
            'repost_count': int,
        },
    }, {
        'note': 'vk.com embedded',
        'url': 'https://ok.ru/video/3568183087575',
        'info_dict': {
            'id': '-165101755_456243749',
            'ext': 'mp4',
            'uploader_id': '-165101755',
            'duration': 132,
            'timestamp': 1642869935,
            'upload_date': '20220122',
            'thumbnail': str,
            'title': str,
            'uploader': str,
        },
        'skip': 'vk extractor error',
    }, {
        # metadata in JSON, webm_dash with Firefox UA
        'url': 'http://ok.ru/video/20079905452',
        'md5': '8f477d8931c531374a3e36daec617b2c',
        'info_dict': {
            'id': '20079905452',
            'ext': 'webm',
            'title': 'Культура меняет нас (прекрасный ролик!))',
            'thumbnail': str,
            'duration': 100,
            'upload_date': '20141207',
            'uploader_id': '330537914540',
            'uploader': 'Виталий Добровольский',
            'like_count': int,
            'age_limit': 0,
        },
        'params': {
            'format': 'bv[ext=webm]',
            'http_headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0'},
        },
    }, {
        # metadataUrl
        'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
        'md5': '2bae2f58eefe1b3d26f3926c4a64d2f3',
        'info_dict': {
            'id': '63567059965189-0',
            'ext': 'mp4',
            'title': 'Девушка без комплексов ...',
            'thumbnail': str,
            'duration': 191,
            'upload_date': '20150518',
            'uploader_id': '534380003155',
            'uploader': '☭ Андрей Мещанинов ☭',
            'like_count': int,
            'age_limit': 0,
            'start_time': 5,
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        # YouTube embed (metadataUrl, provider == USER_YOUTUBE)
        'url': 'https://ok.ru/video/3952212382174',
        'md5': '5fb5f83ce16cb212d6bf887282b5da53',
        'info_dict': {
            'id': '5axVgHHDBvU',
            'ext': 'mp4',
            'title': 'Youtube-dl 101: What is it and HOW to use it! Full Download Walkthrough and Guide',
            'description': 'md5:b57209eeb9d5c2f20c984dfb58862097',
            'uploader': 'Lod Mer',
            'uploader_id': '575186401502',
            'duration': 1529,
            'age_limit': 0,
            'upload_date': '20210405',
            'comment_count': int,
            'live_status': 'not_live',
            'view_count': int,
            'thumbnail': 'https://i.mycdn.me/i?r=AEHujHvw2RjEbemUCNEorZbxYpb_p_9AcN2FmGik64Krkcmz37YtlY093oAM5-HIEAt7Zi9s0CiBOSDmbngC-I-k&fn=external_8',
            'uploader_url': 'https://www.youtube.com/@MrKewlkid94',
            'channel_follower_count': int,
            'tags': ['youtube-dl', 'youtube playlists', 'download videos', 'download audio'],
            'channel_id': 'UCVGtvURtEURYHtJFUegdSug',
            'like_count': int,
            'availability': 'public',
            'channel_url': 'https://www.youtube.com/channel/UCVGtvURtEURYHtJFUegdSug',
            'categories': ['Education'],
            'playable_in_embed': True,
            'channel': 'BornToReact',
        },
    }, {
        # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field)
        'url': 'http://ok.ru/video/62036049272859-0',
        'info_dict': {
            'id': '62036049272859-0',
            'ext': 'mp4',
            'title': 'МУЗЫКА ДОЖДЯ .',
            'description': 'md5:6f1867132bd96e33bf53eda1091e8ed0',
            'upload_date': '20120106',
            'uploader_id': '473534735899',
            'uploader': 'МARINA D',
            'age_limit': 0,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Video has not been found',
    }, {
        'note': 'Only available in mobile webpage',
        'url': 'https://m.ok.ru/video/2361249957145',
        'info_dict': {
            'id': '2361249957145',
            'ext': 'mp4',
            'title': 'Быковское крещение',
            'duration': 3038.181,
            'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+',
        },
    }, {
        'note': 'subtitles',
        'url': 'https://ok.ru/video/4249587550747',
        'info_dict': {
            'id': '4249587550747',
            'ext': 'mp4',
            'title': 'Small Country An African Childhood (2020) (1080p) +subtitle',
            'uploader': 'Sunflower Movies',
            'uploader_id': '595802161179',
            'upload_date': '20220816',
            'duration': 6728,
            'age_limit': 0,
            'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+',
            'like_count': int,
            'subtitles': dict,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
        'only_matching': True,
    }, {
        'url': 'http://www.ok.ru/video/20648036891',
        'only_matching': True,
    }, {
        'url': 'http://www.ok.ru/videoembed/20648036891',
        'only_matching': True,
    }, {
        'url': 'http://m.ok.ru/video/20079905452',
        'only_matching': True,
    }, {
        'url': 'http://mobile.ok.ru/video/20079905452',
        'only_matching': True,
    }, {
        'url': 'https://www.ok.ru/live/484531969818',
        'only_matching': True,
    }, {
        'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#',
        'only_matching': True,
    }, {
        # Paid video
        'url': 'https://ok.ru/video/954886983203',
        'only_matching': True,
    }, {
        'url': 'https://ok.ru/videoembed/2932705602075',
        'info_dict': {
            'id': '2932705602075',
            'ext': 'mp4',
            'thumbnail': 'https://i.mycdn.me/videoPreview?id=1369902483995&type=37&idx=2&tkn=fqlnoQD_xwq5ovIlKfgNyU08qmM&fn=external_8',
            'title': 'Boosty для тебя!',
            'uploader_id': '597811038747',
            'like_count': 0,
            'duration': 35,
        },
    }]

    _WEBPAGE_TESTS = [{
        'url': 'https://boosty.to/ikakprosto/posts/56cedaca-b56a-4dfd-b3ed-98c79cfa0167',
        'info_dict': {
            'id': '3950343629563',
            'ext': 'mp4',
            'thumbnail': 'https://i.mycdn.me/videoPreview?id=2776238394107&type=37&idx=11&tkn=F3ejkUFcpuI4DnMRxrDGcH5YcmM&fn=external_8',
            'title': 'Заяц Бусти.mp4',
            'uploader_id': '571368965883',
            'like_count': 0,
            'duration': 10444,
        },
        'skip': 'Site no longer embeds',
    }]

    def _clear_cookies(self, cdn_url):
        """Remove CDN cookies picked up while probing ``cdn_url``.

        Clears cookies for the ``.mycdn.me`` domain and for the exact host of
        ``cdn_url``, each only when cookies are currently set for it.
        """
        # Direct http downloads will fail if CDN cookies are set
        # so we need to reset them after each format extraction
        if self._get_cookies('https://notarealsubdomain.mycdn.me/'):
            self.cookiejar.clear(domain='.mycdn.me')
        if self._get_cookies(cdn_url):
            self.cookiejar.clear(domain=urllib.parse.urlparse(cdn_url).hostname)

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield embed URLs found in ``webpage``, each smuggled with the
        embedding page's ``url`` as ``referrer``.

        ``_extract_desktop`` later sends that value as the ``Referer`` header.
        """
        for embed_url in super()._extract_embed_urls(url, webpage):
            yield smuggle_url(embed_url, {'referrer': url})

    def _real_extract(self, url):
        """Extract from the desktop page, falling back to the mobile page.

        If both attempts raise ``ExtractorError``, the desktop error is the
        one re-raised.
        """
        try:
            return self._extract_desktop(url)
        except ExtractorError as e:
            try:
                return self._extract_mobile(url)
            except ExtractorError:
                # error message of desktop webpage is in English
                raise e

    def _extract_desktop(self, url):
        """Extract video info from the desktop ``/video`` or ``/videoembed`` page.

        Returns either a ``url_result`` / ``url_transparent`` dict that hands
        off to another extractor (external player, OPEN_GRAPH or USER_YOUTUBE
        providers), or a full info dict with ``formats``.

        Raises ``ExtractorError`` when the page shows an error stub or when a
        natively hosted video has no title.
        """
        start_time = int_or_none(compat_parse_qs(
            compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0])

        url, smuggled = unsmuggle_url(url, {})
        video_id, is_embed = self._match_valid_url(url).group('id', 'embed')
        mode = 'videoembed' if is_embed else 'video'

        webpage = self._download_webpage(
            f'https://ok.ru/{mode}/{video_id}', video_id,
            note='Downloading desktop webpage',
            headers={'Referer': smuggled['referrer']} if smuggled.get('referrer') else {})

        error = self._search_regex(
            r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<',
            webpage, 'error', default=None)
        # Direct link from boosty: retry once with a boosty.to referrer. The
        # `not smuggled.get('referrer')` guard stops this from recursing again.
        if (error == 'The author of this video has not been found or is blocked'
                and not smuggled.get('referrer') and mode == 'videoembed'):
            return self._extract_desktop(smuggle_url(url, {'referrer': 'https://boosty.to'}))
        elif error:
            raise ExtractorError(error, expected=True)

        player = self._parse_json(
            unescapeHTML(self._search_regex(
                r'data-options=(?P<quote>["\'])(?P<player>{.+?%s.+?})(?P=quote)' % video_id,
                webpage, 'player', group='player')),
            video_id)

        # embedded external player
        if player.get('isExternalPlayer') and player.get('url'):
            return self.url_result(player['url'])

        flashvars = player['flashvars']

        # Metadata is either inlined as a JSON string or must be fetched
        # from `metadataUrl` with a POST request.
        metadata = flashvars.get('metadata')
        if metadata:
            metadata = self._parse_json(metadata, video_id)
        else:
            data = {}
            st_location = flashvars.get('location')
            if st_location:
                data['st.location'] = st_location
            metadata = self._download_json(
                compat_urllib_parse_unquote(flashvars['metadataUrl']),
                video_id, 'Downloading metadata JSON',
                data=urlencode_postdata(data))

        movie = metadata['movie']

        # Some embedded videos may not contain title in movie dict (e.g.
        # http://ok.ru/video/62036049272859-0) thus we allow missing title
        # here and it's going to be extracted later by an extractor that
        # will process the actual embed.
        provider = metadata.get('provider')
        title = movie['title'] if provider == 'UPLOADED_ODKL' else movie.get('title')

        thumbnail = movie.get('poster')
        duration = int_or_none(movie.get('duration'))

        # `or {}` also covers an explicit JSON null for "author".
        author = metadata.get('author') or {}
        uploader_id = author.get('id')
        uploader = author.get('name')

        upload_date = unified_strdate(self._html_search_meta(
            'ya:ovs:upload_date', webpage, 'upload date', default=None))

        # age_limit stays None when the page carries no ya:ovs:adult meta tag.
        age_limit = None
        adult = self._html_search_meta(
            'ya:ovs:adult', webpage, 'age limit', default=None)
        if adult:
            age_limit = 18 if adult == 'true' else 0

        like_count = int_or_none(metadata.get('likeCount'))

        subtitles = {}
        for sub in traverse_obj(metadata, ('movie', 'subtitleTracks', ...), expected_type=dict):
            sub_url = sub.get('url')
            if not sub_url:
                continue
            # Tracks without a language tag are filed under 'en'.
            subtitles.setdefault(sub.get('language') or 'en', []).append({
                'url': sub_url,
                'ext': 'vtt',
            })

        info = {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'upload_date': upload_date,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'like_count': like_count,
            'age_limit': age_limit,
            'start_time': start_time,
            'subtitles': subtitles,
        }

        # Externally hosted content (OPEN_GRAPH, e.g. pladform, or a YouTube
        # embed): hand `contentId` to whichever extractor handles it, keeping
        # the metadata collected above.
        if provider in ('OPEN_GRAPH', 'USER_YOUTUBE'):
            info.update({
                '_type': 'url_transparent',
                'url': movie['contentId'],
            })
            return info

        # From here on the video is extracted natively, so a title is
        # required. Raise ExtractorError (not assert) so the check survives
        # `python -O` and the caller can still fall back to the mobile page.
        if not title:
            raise ExtractorError('Unable to extract video title')

        # NOTE(review): the tuple order presumably ranks the `type` digit
        # found in format URLs from worst to best - confirm against
        # `qualities`.
        quality = qualities(('4', '0', '1', '2', '3', '5', '6', '7'))

        formats = [{
            'url': f['url'],
            'ext': 'mp4',
            'format_id': f.get('name'),
        } for f in traverse_obj(metadata, ('videos', lambda _, v: url_or_none(v['url'])))]

        m3u8_url = traverse_obj(metadata, 'hlsManifestUrl', 'ondemandHls')
        if m3u8_url:
            formats.extend(self._extract_m3u8_formats(
                m3u8_url, video_id, 'mp4', 'm3u8_native',
                m3u8_id='hls', fatal=False))
            self._clear_cookies(m3u8_url)

        for mpd_id, mpd_key in [('dash', 'ondemandDash'), ('webm', 'metadataWebmUrl')]:
            mpd_url = metadata.get(mpd_key)
            if mpd_url:
                formats.extend(self._extract_mpd_formats(
                    mpd_url, video_id, mpd_id=mpd_id, fatal=False))
                self._clear_cookies(mpd_url)

        dash_manifest = metadata.get('metadataEmbedded')
        if dash_manifest:
            formats.extend(self._parse_mpd_formats(
                compat_etree_fromstring(dash_manifest), 'mpd'))

        # Quality is assigned only to the formats collected so far; the live
        # formats appended below are deliberately left unranked here.
        for fmt in formats:
            fmt_type = self._search_regex(
                r'\btype[/=](\d)', fmt['url'],
                'format type', default=None)
            if fmt_type:
                fmt['quality'] = quality(fmt_type)

        # Live formats
        m3u8_url = metadata.get('hlsMasterPlaylistUrl')
        if m3u8_url:
            formats.extend(self._extract_m3u8_formats(
                m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
            self._clear_cookies(m3u8_url)
        rtmp_url = metadata.get('rtmpUrl')
        if rtmp_url:
            formats.append({
                'url': rtmp_url,
                'format_id': 'rtmp',
                'ext': 'flv',
            })

        if not formats:
            payment_info = metadata.get('paymentInfo')
            if payment_info:
                self.raise_no_formats('This video is paid, subscribe to download it', expected=True)

        info['formats'] = formats
        return info

    def _extract_mobile(self, url):
        """Extract a single mp4 format from the mobile (m.ok.ru) page.

        Raises ``ExtractorError`` when the page shows an error block or when
        the embedded video JSON has no ``videoSrc``.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(
            f'http://m.ok.ru/video/{video_id}', video_id,
            note='Downloading mobile webpage')

        error = self._search_regex(
            r'видео</a>\s*<div\s+class="empty">(.+?)</div>',
            webpage, 'error', default=None)
        if error:
            raise ExtractorError(error, expected=True)

        json_data = self._search_regex(
            r'data-video="(.+?)"', webpage, 'json data')
        json_data = self._parse_json(unescapeHTML(json_data), video_id) or {}

        # A missing source must surface as ExtractorError, not KeyError, so
        # that `_real_extract` can re-raise the desktop error instead.
        video_src = json_data.get('videoSrc')
        if not video_src:
            raise ExtractorError('Unable to find video source URL in mobile webpage')

        # Resolve the redirect with a HEAD request and use the final URL.
        redirect_url = self._request_webpage(HEADRequest(
            video_src), video_id, 'Requesting download URL').geturl()
        self._clear_cookies(redirect_url)

        return {
            'id': video_id,
            'title': json_data.get('videoName'),
            # videoDuration is scaled down by 1000 (presumably ms -> s).
            'duration': float_or_none(json_data.get('videoDuration'), scale=1000),
            'thumbnail': json_data.get('videoPosterSrc'),
            'formats': [{
                'format_id': 'mobile',
                'url': redirect_url,
                'ext': 'mp4',
            }],
        }