yt_dlp/extractor/odnoklassniki.py

   1 from .common import InfoExtractor
   2 from ..compat import (
   3     compat_etree_fromstring,
   4     compat_parse_qs,
   5     compat_urllib_parse_unquote,
   6     compat_urllib_parse_urlparse,
   7 )
   8 from ..utils import (
   9     ExtractorError,
  10     float_or_none,
  11     int_or_none,
  12     qualities,
  13     smuggle_url,
  14     unescapeHTML,
  15     unified_strdate,
  16     unsmuggle_url,
  17     urlencode_postdata,
  18 )
  19
  20
class OdnoklassnikiIE(InfoExtractor):
    # Extractor for videos served from ok.ru / odnoklassniki.ru.
    #
    # _VALID_URL accepts http(s) URLs on either domain, optionally on the
    # www/m/mobile subdomains, in any of these path shapes:
    #   video/<id>, videoembed/<id>, web-api/video/moviePlayer/<id>,
    #   live/<id>, dk?...st.mvId=<id>
    # The named group "embed" is set only for the videoembed/ form and
    # "id" is a run of digits and dashes.
    _VALID_URL = r'''(?x)
                https?://
                    (?:(?:www|m|mobile)\.)?
                    (?:odnoklassniki|ok)\.ru/
                    (?:
                        video(?P<embed>embed)?/|
                        web-api/video/moviePlayer/|
                        live/|
                        dk\?.*?st\.mvId=
                    )
                    (?P<id>[\d-]+)
                '''
    # Matches an <iframe> whose quoted src is a (possibly scheme-relative)
    # ok.ru/odnoklassniki.ru videoembed/ URL; the URL is captured as "url".
    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1']
    # Test fixtures: each entry pairs a URL with the info_dict fields expected
    # for it. Entries flagged 'only_matching' carry no expected fields.
    _TESTS = [{
        'note': 'Coub embedded',
        'url': 'http://ok.ru/video/1484130554189',
        'info_dict': {
            'id': '1keok9',
            'ext': 'mp4',
            'timestamp': 1545580896,
            'view_count': int,
            'thumbnail': 'https://coub-attachments.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg',
            'title': 'Народная забава',
            'uploader': 'Nevata',
            'upload_date': '20181223',
            'age_limit': 0,
            'uploader_id': 'nevata.s',
            'like_count': int,
            'duration': 8.08,
            'repost_count': int,
        },
    }, {
        'note': 'vk.com embedded',
        'url': 'https://ok.ru/video/3568183087575',
        'info_dict': {
            'id': '-165101755_456243749',
            'ext': 'mp4',
            'uploader_id': '-165101755',
            'duration': 132,
            'timestamp': 1642869935,
            'upload_date': '20220122',
            'thumbnail': str,
            'title': str,
            'uploader': str,
        },
    }, {
        # metadata in JSON
        'url': 'http://ok.ru/video/20079905452',
        'md5': '5d2b64756e2af296e3b383a0bc02a6aa',
        'info_dict': {
            'id': '20079905452',
            'ext': 'mp4',
            'title': 'Культура меняет нас (прекрасный ролик!))',
            'thumbnail': str,
            'duration': 100,
            'upload_date': '20141207',
            'uploader_id': '330537914540',
            'uploader': 'Виталий Добровольский',
            'like_count': int,
            'age_limit': 0,
        },
    }, {
        # metadataUrl
        # (the fromTime=5 query parameter is expected to surface as start_time)
        'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
        'md5': 'f8c951122516af72e6e6ffdd3c41103b',
        'info_dict': {
            'id': '63567059965189-0',
            'ext': 'mp4',
            'title': 'Девушка без комплексов ...',
            'thumbnail': str,
            'duration': 191,
            'upload_date': '20150518',
            'uploader_id': '534380003155',
            'uploader': '☭ Андрей Мещанинов ☭',
            'like_count': int,
            'age_limit': 0,
            'start_time': 5,
        },
    }, {
        # YouTube embed (metadataUrl, provider == USER_YOUTUBE)
        'url': 'https://ok.ru/video/3952212382174',
        'md5': '91749d0bd20763a28d083fa335bbd37a',
        'info_dict': {
            'id': '5axVgHHDBvU',
            'ext': 'mp4',
            'title': 'Youtube-dl 101: What is it and HOW to use it! Full Download Walkthrough and Guide',
            'description': 'md5:b57209eeb9d5c2f20c984dfb58862097',
            'uploader': 'Lod Mer',
            'uploader_id': '575186401502',
            'duration': 1529,
            'age_limit': 0,
            'upload_date': '20210405',
            'comment_count': int,
            'live_status': 'not_live',
            'view_count': int,
            'thumbnail': 'https://i.mycdn.me/i?r=AEHujHvw2RjEbemUCNEorZbxYpb_p_9AcN2FmGik64Krkcmz37YtlY093oAM5-HIEAt7Zi9s0CiBOSDmbngC-I-k&fn=external_8',
            'uploader_url': 'http://www.youtube.com/user/MrKewlkid94',
            'channel_follower_count': int,
            'tags': ['youtube-dl', 'youtube playlists', 'download videos', 'download audio'],
            'channel_id': 'UCVGtvURtEURYHtJFUegdSug',
            'like_count': int,
            'availability': 'public',
            'channel_url': 'https://www.youtube.com/channel/UCVGtvURtEURYHtJFUegdSug',
            'categories': ['Education'],
            'playable_in_embed': True,
            'channel': 'BornToReact',
        },
    }, {
        # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field)
        'url': 'http://ok.ru/video/62036049272859-0',
        'info_dict': {
            'id': '62036049272859-0',
            'ext': 'mp4',
            'title': 'МУЗЫКА     ДОЖДЯ .',
            'description': 'md5:6f1867132bd96e33bf53eda1091e8ed0',
            'upload_date': '20120106',
            'uploader_id': '473534735899',
            'uploader': 'МARINA D',
            'age_limit': 0,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Video has not been found',
    }, {
        # TODO: HTTP Error 400: Bad Request, it only works if there's no cookies when downloading
        'note': 'Only available in mobile webpage',
        'url': 'https://m.ok.ru/video/2361249957145',
        'info_dict': {
            'id': '2361249957145',
            'ext': 'mp4',
            'title': 'Быковское крещение',
            'duration': 3038.181,
        },
    }, {
        # web-api/video/moviePlayer/ URL form
        'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
        'only_matching': True,
    }, {
        # www. subdomain
        'url': 'http://www.ok.ru/video/20648036891',
        'only_matching': True,
    }, {
        # videoembed/ URL form
        'url': 'http://www.ok.ru/videoembed/20648036891',
        'only_matching': True,
    }, {
        # m. subdomain
        'url': 'http://m.ok.ru/video/20079905452',
        'only_matching': True,
    }, {
        # mobile. subdomain
        'url': 'http://mobile.ok.ru/video/20079905452',
        'only_matching': True,
    }, {
        # live/ URL form
        'url': 'https://www.ok.ru/live/484531969818',
        'only_matching': True,
    }, {
        # dk?...st.mvId=<id> URL form
        'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#',
        'only_matching': True,
    }, {
        # Paid video
        'url': 'https://ok.ru/video/954886983203',
        'only_matching': True,
    }, {
        # videoembed/ URL with full expected metadata (title mentions Boosty)
        'url': 'https://ok.ru/videoembed/2932705602075',
        'info_dict': {
            'id': '2932705602075',
            'ext': 'mp4',
            'thumbnail': 'https://i.mycdn.me/videoPreview?id=1369902483995&type=37&idx=2&tkn=fqlnoQD_xwq5ovIlKfgNyU08qmM&fn=external_8',
            'title': 'Boosty для тебя!',
            'uploader_id': '597811038747',
            'like_count': 0,
            'duration': 35,
        },
    }]

    # Fixture for a third-party page (boosty.to) that embeds an ok.ru video.
    _WEBPAGE_TESTS = [{
        'url': 'https://boosty.to/ikakprosto/posts/56cedaca-b56a-4dfd-b3ed-98c79cfa0167',
        'info_dict': {
            'id': '3950343629563',
            'ext': 'mp4',
            'thumbnail': 'https://i.mycdn.me/videoPreview?id=2776238394107&type=37&idx=11&tkn=F3ejkUFcpuI4DnMRxrDGcH5YcmM&fn=external_8',
            'title': 'Заяц Бусти.mp4',
            'uploader_id': '571368965883',
            'like_count': 0,
            'duration': 10444,
        },
    }]
 206
 207     @classmethod
 208     def _extract_embed_urls(cls, url, webpage):
 209         for x in super()._extract_embed_urls(url, webpage):
 210             yield smuggle_url(x, {'referrer': url})
 211
 212     def _real_extract(self, url):
 213         try:
 214             return self._extract_desktop(url)
 215         except ExtractorError as e:
 216             try:
 217                 return self._extract_mobile(url)
 218             except ExtractorError:
 219                 # error message of desktop webpage is in English
 220                 raise e
 221
 222     def _extract_desktop(self, url):
 223         start_time = int_or_none(compat_parse_qs(
 224             compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0])
 225
 226         url, smuggled = unsmuggle_url(url, {})
 227         video_id, is_embed = self._match_valid_url(url).group('id', 'embed')
 228         mode = 'videoembed' if is_embed else 'video'
 229
 230         webpage = self._download_webpage(
 231             f'https://ok.ru/{mode}/{video_id}', video_id,
 232             note='Downloading desktop webpage',
 233             headers={'Referer': smuggled['referrer']} if smuggled.get('referrer') else {})
 234
 235         error = self._search_regex(
 236             r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<',
 237             webpage, 'error', default=None)
 238         # Direct link from boosty
 239         if (error == 'The author of this video has not been found or is blocked'
 240                 and not smuggled.get('referrer') and mode == 'videoembed'):
 241             return self._extract_desktop(smuggle_url(url, {'referrer': 'https://boosty.to'}))
 242         elif error:
 243             raise ExtractorError(error, expected=True)
 244
 245         player = self._parse_json(
 246             unescapeHTML(self._search_regex(
 247                 r'data-options=(?P<quote>["\'])(?P<player>{.+?%s.+?})(?P=quote)' % video_id,
 248                 webpage, 'player', group='player')),
 249             video_id)
 250
 251         # embedded external player
 252         if player.get('isExternalPlayer') and player.get('url'):
 253             return self.url_result(player['url'])
 254
 255         flashvars = player['flashvars']
 256
 257         metadata = flashvars.get('metadata')
 258         if metadata:
 259             metadata = self._parse_json(metadata, video_id)
 260         else:
 261             data = {}
 262             st_location = flashvars.get('location')
 263             if st_location:
 264                 data['st.location'] = st_location
 265             metadata = self._download_json(
 266                 compat_urllib_parse_unquote(flashvars['metadataUrl']),
 267                 video_id, 'Downloading metadata JSON',
 268                 data=urlencode_postdata(data))
 269
 270         movie = metadata['movie']
 271
 272         # Some embedded videos may not contain title in movie dict (e.g.
 273         # http://ok.ru/video/62036049272859-0) thus we allow missing title
 274         # here and it's going to be extracted later by an extractor that
 275         # will process the actual embed.
 276         provider = metadata.get('provider')
 277         title = movie['title'] if provider == 'UPLOADED_ODKL' else movie.get('title')
 278
 279         thumbnail = movie.get('poster')
 280         duration = int_or_none(movie.get('duration'))
 281
 282         author = metadata.get('author', {})
 283         uploader_id = author.get('id')
 284         uploader = author.get('name')
 285
 286         upload_date = unified_strdate(self._html_search_meta(
 287             'ya:ovs:upload_date', webpage, 'upload date', default=None))
 288
 289         age_limit = None
 290         adult = self._html_search_meta(
 291             'ya:ovs:adult', webpage, 'age limit', default=None)
 292         if adult:
 293             age_limit = 18 if adult == 'true' else 0
 294
 295         like_count = int_or_none(metadata.get('likeCount'))
 296
 297         info = {
 298             'id': video_id,
 299             'title': title,
 300             'thumbnail': thumbnail,
 301             'duration': duration,
 302             'upload_date': upload_date,
 303             'uploader': uploader,
 304             'uploader_id': uploader_id,
 305             'like_count': like_count,
 306             'age_limit': age_limit,
 307             'start_time': start_time,
 308         }
 309
 310         # pladform
 311         if provider == 'OPEN_GRAPH':
 312             info.update({
 313                 '_type': 'url_transparent',
 314                 'url': movie['contentId'],
 315             })
 316             return info
 317
 318         if provider == 'USER_YOUTUBE':
 319             info.update({
 320                 '_type': 'url_transparent',
 321                 'url': movie['contentId'],
 322             })
 323             return info
 324
 325         assert title
 326         if provider == 'LIVE_TV_APP':
 327             info['title'] = title
 328
 329         quality = qualities(('4', '0', '1', '2', '3', '5', '6', '7'))
 330
 331         formats = [{
 332             'url': f['url'],
 333             'ext': 'mp4',
 334             'format_id': f['name'],
 335         } for f in metadata['videos']]
 336
 337         m3u8_url = metadata.get('hlsManifestUrl')
 338         if m3u8_url:
 339             formats.extend(self._extract_m3u8_formats(
 340                 m3u8_url, video_id, 'mp4', 'm3u8_native',
 341                 m3u8_id='hls', fatal=False))
 342
 343         dash_manifest = metadata.get('metadataEmbedded')
 344         if dash_manifest:
 345             formats.extend(self._parse_mpd_formats(
 346                 compat_etree_fromstring(dash_manifest), 'mpd'))
 347
 348         for fmt in formats:
 349             fmt_type = self._search_regex(
 350                 r'\btype[/=](\d)', fmt['url'],
 351                 'format type', default=None)
 352             if fmt_type:
 353                 fmt['quality'] = quality(fmt_type)
 354
 355         # Live formats
 356         m3u8_url = metadata.get('hlsMasterPlaylistUrl')
 357         if m3u8_url:
 358             formats.extend(self._extract_m3u8_formats(
 359                 m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
 360         rtmp_url = metadata.get('rtmpUrl')
 361         if rtmp_url:
 362             formats.append({
 363                 'url': rtmp_url,
 364                 'format_id': 'rtmp',
 365                 'ext': 'flv',
 366             })
 367
 368         if not formats:
 369             payment_info = metadata.get('paymentInfo')
 370             if payment_info:
 371                 self.raise_no_formats('This video is paid, subscribe to download it', expected=True)
 372
 373         self._sort_formats(formats)
 374
 375         info['formats'] = formats
 376         return info
 377
 378     def _extract_mobile(self, url):
 379         video_id = self._match_id(url)
 380
 381         webpage = self._download_webpage(
 382             'http://m.ok.ru/video/%s' % video_id, video_id,
 383             note='Downloading mobile webpage')
 384
 385         error = self._search_regex(
 386             r'видео</a>\s*<div\s+class="empty">(.+?)</div>',
 387             webpage, 'error', default=None)
 388         if error:
 389             raise ExtractorError(error, expected=True)
 390
 391         json_data = self._search_regex(
 392             r'data-video="(.+?)"', webpage, 'json data')
 393         json_data = self._parse_json(unescapeHTML(json_data), video_id) or {}
 394
 395         return {
 396             'id': video_id,
 397             'title': json_data.get('videoName'),
 398             'duration': float_or_none(json_data.get('videoDuration'), scale=1000),
 399             'thumbnail': json_data.get('videoPosterSrc'),
 400             'formats': [{
 401                 'format_id': 'mobile',
 402                 'url': json_data.get('videoSrc'),
 403                 'ext': 'mp4',
 404             }]
 405         }