yt_dlp/extractor/odnoklassniki.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..compat import (
   5     compat_etree_fromstring,
   6     compat_parse_qs,
   7     compat_urllib_parse_unquote,
   8     compat_urllib_parse_urlparse,
   9 )
  10 from ..utils import (
  11     ExtractorError,
  12     float_or_none,
  13     unified_strdate,
  14     int_or_none,
  15     qualities,
  16     unescapeHTML,
  17     urlencode_postdata,
  18 )
  19
  20
  21 class OdnoklassnikiIE(InfoExtractor):
  22     _VALID_URL = r'''(?x)
  23                 https?://
  24                     (?:(?:www|m|mobile)\.)?
  25                     (?:odnoklassniki|ok)\.ru/
  26                     (?:
  27                         video(?:embed)?/|
  28                         web-api/video/moviePlayer/|
  29                         live/|
  30                         dk\?.*?st\.mvId=
  31                     )
  32                     (?P<id>[\d-]+)
  33                 '''
  34     _TESTS = [{
  35         'note': 'Coub embedded',
  36         'url': 'http://ok.ru/video/1484130554189',
  37         'info_dict': {
  38             'id': '1keok9',
  39             'ext': 'mp4',
  40             'timestamp': 1545580896,
  41             'view_count': int,
  42             'thumbnail': 'https://coub-anubis-a.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg',
  43             'title': 'Народная забава',
  44             'uploader': 'Nevata',
  45             'upload_date': '20181223',
  46             'age_limit': 0,
  47             'uploader_id': 'nevata.s',
  48             'like_count': int,
  49             'duration': 8.08,
  50             'repost_count': int,
  51         },
  52     }, {
  53         'note': 'vk.com embedded',
  54         'url': 'https://ok.ru/video/3568183087575',
  55         'info_dict': {
  56             'id': '-165101755_456243749',
  57             'ext': 'mp4',
  58             'uploader_id': '-165101755',
  59             'duration': 132,
  60             'timestamp': 1642869935,
  61             'upload_date': '20220122',
  62             'thumbnail': str,
  63             'title': str,
  64             'uploader': str,
  65         },
  66     }, {
  67         # metadata in JSON
  68         'url': 'http://ok.ru/video/20079905452',
  69         'md5': '0b62089b479e06681abaaca9d204f152',
  70         'info_dict': {
  71             'id': '20079905452',
  72             'ext': 'mp4',
  73             'title': 'Культура меняет нас (прекрасный ролик!))',
  74             'duration': 100,
  75             'upload_date': '20141207',
  76             'uploader_id': '330537914540',
  77             'uploader': 'Виталий Добровольский',
  78             'like_count': int,
  79             'age_limit': 0,
  80         },
  81     }, {
  82         # metadataUrl
  83         'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
  84         'md5': '6ff470ea2dd51d5d18c295a355b0b6bc',
  85         'info_dict': {
  86             'id': '63567059965189-0',
  87             'ext': 'mp4',
  88             'title': 'Девушка без комплексов ...',
  89             'duration': 191,
  90             'upload_date': '20150518',
  91             'uploader_id': '534380003155',
  92             'uploader': '☭ Андрей Мещанинов ☭',
  93             'like_count': int,
  94             'age_limit': 0,
  95             'start_time': 5,
  96         },
  97     }, {
  98         # YouTube embed (metadataUrl, provider == USER_YOUTUBE)
  99         'url': 'http://ok.ru/video/64211978996595-1',
 100         'md5': '2f206894ffb5dbfcce2c5a14b909eea5',
 101         'info_dict': {
 102             'id': 'V_VztHT5BzY',
 103             'ext': 'mp4',
 104             'title': 'Космическая среда от 26 августа 2015',
 105             'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0',
 106             'duration': 440,
 107             'upload_date': '20150826',
 108             'uploader_id': 'tvroscosmos',
 109             'uploader': 'Телестудия Роскосмоса',
 110             'age_limit': 0,
 111         },
 112     }, {
 113         # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field)
 114         'url': 'http://ok.ru/video/62036049272859-0',
 115         'info_dict': {
 116             'id': '62036049272859-0',
 117             'ext': 'mp4',
 118             'title': 'МУЗЫКА     ДОЖДЯ .',
 119             'description': 'md5:6f1867132bd96e33bf53eda1091e8ed0',
 120             'upload_date': '20120106',
 121             'uploader_id': '473534735899',
 122             'uploader': 'МARINA D',
 123             'age_limit': 0,
 124         },
 125         'params': {
 126             'skip_download': True,
 127         },
 128         'skip': 'Video has not been found',
 129     }, {
 130         'note': 'Only available in mobile webpage',
 131         'url': 'https://m.ok.ru/video/2361249957145',
 132         'info_dict': {
 133             'id': '2361249957145',
 134             'title': 'Быковское крещение',
 135             'duration': 3038.181,
 136         },
 137     }, {
 138         'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
 139         'only_matching': True,
 140     }, {
 141         'url': 'http://www.ok.ru/video/20648036891',
 142         'only_matching': True,
 143     }, {
 144         'url': 'http://www.ok.ru/videoembed/20648036891',
 145         'only_matching': True,
 146     }, {
 147         'url': 'http://m.ok.ru/video/20079905452',
 148         'only_matching': True,
 149     }, {
 150         'url': 'http://mobile.ok.ru/video/20079905452',
 151         'only_matching': True,
 152     }, {
 153         'url': 'https://www.ok.ru/live/484531969818',
 154         'only_matching': True,
 155     }, {
 156         'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#',
 157         'only_matching': True,
 158     }, {
 159         # Paid video
 160         'url': 'https://ok.ru/video/954886983203',
 161         'only_matching': True,
 162     }]
 163
 164     @staticmethod
 165     def _extract_url(webpage):
 166         mobj = re.search(
 167             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage)
 168         if mobj:
 169             return mobj.group('url')
 170
 171     def _real_extract(self, url):
 172         try:
 173             return self._extract_desktop(url)
 174         except ExtractorError as e:
 175             try:
 176                 return self._extract_mobile(url)
 177             except ExtractorError:
 178                 # error message of desktop webpage is in English
 179                 raise e
 180
 181     def _extract_desktop(self, url):
 182         start_time = int_or_none(compat_parse_qs(
 183             compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0])
 184
 185         video_id = self._match_id(url)
 186
 187         webpage = self._download_webpage(
 188             'http://ok.ru/video/%s' % video_id, video_id,
 189             note='Downloading desktop webpage')
 190
 191         error = self._search_regex(
 192             r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<',
 193             webpage, 'error', default=None)
 194         if error:
 195             raise ExtractorError(error, expected=True)
 196
 197         player = self._parse_json(
 198             unescapeHTML(self._search_regex(
 199                 r'data-options=(?P<quote>["\'])(?P<player>{.+?%s.+?})(?P=quote)' % video_id,
 200                 webpage, 'player', group='player')),
 201             video_id)
 202
 203         # embedded external player
 204         if player.get('isExternalPlayer') and player.get('url'):
 205             return self.url_result(player['url'])
 206
 207         flashvars = player['flashvars']
 208
 209         metadata = flashvars.get('metadata')
 210         if metadata:
 211             metadata = self._parse_json(metadata, video_id)
 212         else:
 213             data = {}
 214             st_location = flashvars.get('location')
 215             if st_location:
 216                 data['st.location'] = st_location
 217             metadata = self._download_json(
 218                 compat_urllib_parse_unquote(flashvars['metadataUrl']),
 219                 video_id, 'Downloading metadata JSON',
 220                 data=urlencode_postdata(data))
 221
 222         movie = metadata['movie']
 223
 224         # Some embedded videos may not contain title in movie dict (e.g.
 225         # http://ok.ru/video/62036049272859-0) thus we allow missing title
 226         # here and it's going to be extracted later by an extractor that
 227         # will process the actual embed.
 228         provider = metadata.get('provider')
 229         title = movie['title'] if provider == 'UPLOADED_ODKL' else movie.get('title')
 230
 231         thumbnail = movie.get('poster')
 232         duration = int_or_none(movie.get('duration'))
 233
 234         author = metadata.get('author', {})
 235         uploader_id = author.get('id')
 236         uploader = author.get('name')
 237
 238         upload_date = unified_strdate(self._html_search_meta(
 239             'ya:ovs:upload_date', webpage, 'upload date', default=None))
 240
 241         age_limit = None
 242         adult = self._html_search_meta(
 243             'ya:ovs:adult', webpage, 'age limit', default=None)
 244         if adult:
 245             age_limit = 18 if adult == 'true' else 0
 246
 247         like_count = int_or_none(metadata.get('likeCount'))
 248
 249         info = {
 250             'id': video_id,
 251             'title': title,
 252             'thumbnail': thumbnail,
 253             'duration': duration,
 254             'upload_date': upload_date,
 255             'uploader': uploader,
 256             'uploader_id': uploader_id,
 257             'like_count': like_count,
 258             'age_limit': age_limit,
 259             'start_time': start_time,
 260         }
 261
 262         # pladform
 263         if provider == 'OPEN_GRAPH':
 264             info.update({
 265                 '_type': 'url_transparent',
 266                 'url': movie['contentId'],
 267             })
 268             return info
 269
 270         if provider == 'USER_YOUTUBE':
 271             info.update({
 272                 '_type': 'url_transparent',
 273                 'url': movie['contentId'],
 274             })
 275             return info
 276
 277         assert title
 278         if provider == 'LIVE_TV_APP':
 279             info['title'] = title
 280
 281         quality = qualities(('4', '0', '1', '2', '3', '5'))
 282
 283         formats = [{
 284             'url': f['url'],
 285             'ext': 'mp4',
 286             'format_id': f['name'],
 287         } for f in metadata['videos']]
 288
 289         m3u8_url = metadata.get('hlsManifestUrl')
 290         if m3u8_url:
 291             formats.extend(self._extract_m3u8_formats(
 292                 m3u8_url, video_id, 'mp4', 'm3u8_native',
 293                 m3u8_id='hls', fatal=False))
 294
 295         dash_manifest = metadata.get('metadataEmbedded')
 296         if dash_manifest:
 297             formats.extend(self._parse_mpd_formats(
 298                 compat_etree_fromstring(dash_manifest), 'mpd'))
 299
 300         for fmt in formats:
 301             fmt_type = self._search_regex(
 302                 r'\btype[/=](\d)', fmt['url'],
 303                 'format type', default=None)
 304             if fmt_type:
 305                 fmt['quality'] = quality(fmt_type)
 306
 307         # Live formats
 308         m3u8_url = metadata.get('hlsMasterPlaylistUrl')
 309         if m3u8_url:
 310             formats.extend(self._extract_m3u8_formats(
 311                 m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
 312         rtmp_url = metadata.get('rtmpUrl')
 313         if rtmp_url:
 314             formats.append({
 315                 'url': rtmp_url,
 316                 'format_id': 'rtmp',
 317                 'ext': 'flv',
 318             })
 319
 320         if not formats:
 321             payment_info = metadata.get('paymentInfo')
 322             if payment_info:
 323                 self.raise_no_formats('This video is paid, subscribe to download it', expected=True)
 324
 325         self._sort_formats(formats)
 326
 327         info['formats'] = formats
 328         return info
 329
 330     def _extract_mobile(self, url):
 331         video_id = self._match_id(url)
 332
 333         webpage = self._download_webpage(
 334             'http://m.ok.ru/video/%s' % video_id, video_id,
 335             note='Downloading mobile webpage')
 336
 337         error = self._search_regex(
 338             r'видео</a>\s*<div\s+class="empty">(.+?)</div>',
 339             webpage, 'error', default=None)
 340         if error:
 341             raise ExtractorError(error, expected=True)
 342
 343         json_data = self._search_regex(
 344             r'data-video="(.+?)"', webpage, 'json data')
 345         json_data = self._parse_json(unescapeHTML(json_data), video_id) or {}
 346
 347         return {
 348             'id': video_id,
 349             'title': json_data.get('videoName'),
 350             'duration': float_or_none(json_data.get('videoDuration'), scale=1000),
 351             'thumbnail': json_data.get('videoPosterSrc'),
 352             'formats': [{
 353                 'format_id': 'mobile',
 354                 'url': json_data.get('videoSrc'),
 355                 'ext': 'mp4',
 356             }]
 357         }