[extractor] Deprecate `_sort_formats`

[yt-dlp.git] / yt_dlp / extractor / odnoklassniki.py
diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py

index eeb57f2fed5869884cae80e6baa37d9616915023..4f325f08784892a03fe9c77327ff46021e2611fa 100644 (file)
--- a/yt_dlp/extractor/odnoklassniki.py
+++ b/yt_dlp/extractor/odnoklassniki.py
@@ -1,8 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
  from .common import InfoExtractor
  from ..compat import (
      compat_etree_fromstring,
@@ -13,10 +8,12 @@
  from ..utils import (
      ExtractorError,
      float_or_none,
-    unified_strdate,
      int_or_none,
      qualities,
+    smuggle_url,
      unescapeHTML,
+    unified_strdate,
+    unsmuggle_url,
      urlencode_postdata,
  )
  
@@ -27,21 +24,55 @@ class OdnoklassnikiIE(InfoExtractor):
                      (?:(?:www|m|mobile)\.)?
                      (?:odnoklassniki|ok)\.ru/
                      (?:
-                        video(?:embed)?/|
+                        video(?P<embed>embed)?/|
                          web-api/video/moviePlayer/|
                          live/|
                          dk\?.*?st\.mvId=
                      )
                      (?P<id>[\d-]+)
                  '''
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1']
      _TESTS = [{
+        'note': 'Coub embedded',
+        'url': 'http://ok.ru/video/1484130554189',
+        'info_dict': {
+            'id': '1keok9',
+            'ext': 'mp4',
+            'timestamp': 1545580896,
+            'view_count': int,
+            'thumbnail': 'https://coub-attachments.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg',
+            'title': 'Народная забава',
+            'uploader': 'Nevata',
+            'upload_date': '20181223',
+            'age_limit': 0,
+            'uploader_id': 'nevata.s',
+            'like_count': int,
+            'duration': 8.08,
+            'repost_count': int,
+        },
+    }, {
+        'note': 'vk.com embedded',
+        'url': 'https://ok.ru/video/3568183087575',
+        'info_dict': {
+            'id': '-165101755_456243749',
+            'ext': 'mp4',
+            'uploader_id': '-165101755',
+            'duration': 132,
+            'timestamp': 1642869935,
+            'upload_date': '20220122',
+            'thumbnail': str,
+            'title': str,
+            'uploader': str,
+        },
+    }, {
          # metadata in JSON
          'url': 'http://ok.ru/video/20079905452',
-        'md5': '0b62089b479e06681abaaca9d204f152',
+        'md5': '5d2b64756e2af296e3b383a0bc02a6aa',
          'info_dict': {
              'id': '20079905452',
              'ext': 'mp4',
              'title': 'Культура меняет нас (прекрасный ролик!))',
+            'thumbnail': str,
              'duration': 100,
              'upload_date': '20141207',
              'uploader_id': '330537914540',
@@ -52,11 +83,12 @@ class OdnoklassnikiIE(InfoExtractor):
      }, {
          # metadataUrl
          'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
-        'md5': '6ff470ea2dd51d5d18c295a355b0b6bc',
+        'md5': 'f8c951122516af72e6e6ffdd3c41103b',
          'info_dict': {
              'id': '63567059965189-0',
              'ext': 'mp4',
              'title': 'Девушка без комплексов ...',
+            'thumbnail': str,
              'duration': 191,
              'upload_date': '20150518',
              'uploader_id': '534380003155',
@@ -67,18 +99,32 @@ class OdnoklassnikiIE(InfoExtractor):
          },
      }, {
          # YouTube embed (metadataUrl, provider == USER_YOUTUBE)
-        'url': 'http://ok.ru/video/64211978996595-1',
-        'md5': '2f206894ffb5dbfcce2c5a14b909eea5',
+        'url': 'https://ok.ru/video/3952212382174',
+        'md5': '91749d0bd20763a28d083fa335bbd37a',
          'info_dict': {
-            'id': 'V_VztHT5BzY',
+            'id': '5axVgHHDBvU',
              'ext': 'mp4',
-            'title': 'Космическая среда от 26 августа 2015',
-            'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0',
-            'duration': 440,
-            'upload_date': '20150826',
-            'uploader_id': 'tvroscosmos',
-            'uploader': 'Телестудия Роскосмоса',
+            'title': 'Youtube-dl 101: What is it and HOW to use it! Full Download Walkthrough and Guide',
+            'description': 'md5:b57209eeb9d5c2f20c984dfb58862097',
+            'uploader': 'Lod Mer',
+            'uploader_id': '575186401502',
+            'duration': 1529,
              'age_limit': 0,
+            'upload_date': '20210405',
+            'comment_count': int,
+            'live_status': 'not_live',
+            'view_count': int,
+            'thumbnail': 'https://i.mycdn.me/i?r=AEHujHvw2RjEbemUCNEorZbxYpb_p_9AcN2FmGik64Krkcmz37YtlY093oAM5-HIEAt7Zi9s0CiBOSDmbngC-I-k&fn=external_8',
+            'uploader_url': 'http://www.youtube.com/user/MrKewlkid94',
+            'channel_follower_count': int,
+            'tags': ['youtube-dl', 'youtube playlists', 'download videos', 'download audio'],
+            'channel_id': 'UCVGtvURtEURYHtJFUegdSug',
+            'like_count': int,
+            'availability': 'public',
+            'channel_url': 'https://www.youtube.com/channel/UCVGtvURtEURYHtJFUegdSug',
+            'categories': ['Education'],
+            'playable_in_embed': True,
+            'channel': 'BornToReact',
          },
      }, {
          # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field)
@@ -98,10 +144,12 @@ class OdnoklassnikiIE(InfoExtractor):
          },
          'skip': 'Video has not been found',
      }, {
+        # TODO: HTTP Error 400: Bad Request, it only works if there are no cookies when downloading
          'note': 'Only available in mobile webpage',
          'url': 'https://m.ok.ru/video/2361249957145',
          'info_dict': {
              'id': '2361249957145',
+            'ext': 'mp4',
              'title': 'Быковское крещение',
              'duration': 3038.181,
          },
@@ -130,14 +178,36 @@ class OdnoklassnikiIE(InfoExtractor):
          # Paid video
          'url': 'https://ok.ru/video/954886983203',
          'only_matching': True,
+    }, {
+        'url': 'https://ok.ru/videoembed/2932705602075',
+        'info_dict': {
+            'id': '2932705602075',
+            'ext': 'mp4',
+            'thumbnail': 'https://i.mycdn.me/videoPreview?id=1369902483995&type=37&idx=2&tkn=fqlnoQD_xwq5ovIlKfgNyU08qmM&fn=external_8',
+            'title': 'Boosty для тебя!',
+            'uploader_id': '597811038747',
+            'like_count': 0,
+            'duration': 35,
+        },
      }]
  
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage)
-        if mobj:
-            return mobj.group('url')
+    _WEBPAGE_TESTS = [{
+        'url': 'https://boosty.to/ikakprosto/posts/56cedaca-b56a-4dfd-b3ed-98c79cfa0167',
+        'info_dict': {
+            'id': '3950343629563',
+            'ext': 'mp4',
+            'thumbnail': 'https://i.mycdn.me/videoPreview?id=2776238394107&type=37&idx=11&tkn=F3ejkUFcpuI4DnMRxrDGcH5YcmM&fn=external_8',
+            'title': 'Заяц Бусти.mp4',
+            'uploader_id': '571368965883',
+            'like_count': 0,
+            'duration': 10444,
+        },
+    }]
+
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        """Yield each embed URL reported by the parent class, smuggled with the embedding page.
+
+        `url` is the page the embed was found on; it is attached to every
+        yielded URL under the 'referrer' key.
+        """
+        parent_urls = super()._extract_embed_urls(url, webpage)
+        # _extract_desktop unsmuggles 'referrer' and sends it as the Referer header
+        yield from (smuggle_url(embed_url, {'referrer': url}) for embed_url in parent_urls)
  
      def _real_extract(self, url):
          try:
@@ -153,16 +223,23 @@ def _extract_desktop(self, url):
          start_time = int_or_none(compat_parse_qs(
              compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0])
  
-        video_id = self._match_id(url)
+        url, smuggled = unsmuggle_url(url, {})
+        video_id, is_embed = self._match_valid_url(url).group('id', 'embed')
+        mode = 'videoembed' if is_embed else 'video'
  
          webpage = self._download_webpage(
-            'http://ok.ru/video/%s' % video_id, video_id,
-            note='Downloading desktop webpage')
+            f'https://ok.ru/{mode}/{video_id}', video_id,
+            note='Downloading desktop webpage',
+            headers={'Referer': smuggled['referrer']} if smuggled.get('referrer') else {})
  
          error = self._search_regex(
              r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<',
              webpage, 'error', default=None)
-        if error:
+        # Direct link from boosty
+        if (error == 'The author of this video has not been found or is blocked'
+                and not smuggled.get('referrer') and mode == 'videoembed'):
+            return self._extract_desktop(smuggle_url(url, {'referrer': 'https://boosty.to'}))
+        elif error:
              raise ExtractorError(error, expected=True)
  
          player = self._parse_json(
@@ -171,6 +248,10 @@ def _extract_desktop(self, url):
                  webpage, 'player', group='player')),
              video_id)
  
+        # embedded external player
+        if player.get('isExternalPlayer') and player.get('url'):
+            return self.url_result(player['url'])
+
          flashvars = player['flashvars']
  
          metadata = flashvars.get('metadata')
@@ -226,6 +307,14 @@ def _extract_desktop(self, url):
              'start_time': start_time,
          }
  
+        # pladform
+        if provider == 'OPEN_GRAPH':
+            info.update({
+                '_type': 'url_transparent',
+                'url': movie['contentId'],
+            })
+            return info
+
          if provider == 'USER_YOUTUBE':
              info.update({
                  '_type': 'url_transparent',
@@ -235,9 +324,9 @@ def _extract_desktop(self, url):
  
          assert title
          if provider == 'LIVE_TV_APP':
-            info['title'] = self._live_title(title)
+            info['title'] = title
  
-        quality = qualities(('4', '0', '1', '2', '3', '5'))
+        quality = qualities(('4', '0', '1', '2', '3', '5', '6', '7'))
  
          formats = [{
              'url': f['url'],
@@ -281,8 +370,6 @@ def _extract_desktop(self, url):
              if payment_info:
                  self.raise_no_formats('This video is paid, subscribe to download it', expected=True)
  
-        self._sort_formats(formats)
-
          info['formats'] = formats
          return info