[cleanup] Add more ruff rules (#10149)

[yt-dlp.git] / yt_dlp / extractor / anvato.py
diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py

index 7386ae2e6bd35ea305972b60abd2856e7fde25cc..bf3d60b5ee5eacbd207e4d2406f4ffb96d3fabbe 100644 (file)
--- a/yt_dlp/extractor/anvato.py
+++ b/yt_dlp/extractor/anvato.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
  import base64
  import hashlib
  import json
@@ -10,37 +7,68 @@
  
  from .common import InfoExtractor
  from ..aes import aes_encrypt
-from ..compat import compat_str
  from ..utils import (
      bytes_to_intlist,
      determine_ext,
-    intlist_to_bytes,
      int_or_none,
+    intlist_to_bytes,
+    join_nonempty,
+    smuggle_url,
      strip_jsonp,
+    traverse_obj,
      unescapeHTML,
      unsmuggle_url,
  )
  
-# This import causes a ModuleNotFoundError on some systems for unknown reason.
-# See issues:
-# https://github.com/yt-dlp/yt-dlp/issues/35
-# https://github.com/ytdl-org/youtube-dl/issues/27449
-# https://github.com/animelover1984/youtube-dl/issues/17
-try:
-    from .anvato_token_generator import NFLTokenGenerator
-except ImportError:
-    NFLTokenGenerator = None
-
  
  def md5_text(s):
-    if not isinstance(s, compat_str):
-        s = compat_str(s)
-    return hashlib.md5(s.encode('utf-8')).hexdigest()
+    return hashlib.md5(str(s).encode()).hexdigest()
  
  
  class AnvatoIE(InfoExtractor):
      _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
  
+    _API_BASE_URL = 'https://tkx.mp.lura.live/rest/v2'
+    _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
+    _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'  # from anvplayer.min.js
+
+    _TESTS = [{
+        # from https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14
+        'url': 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441',
+        'md5': '921919dab3cd0b849ff3d624831ae3e2',
+        'info_dict': {
+            'id': '899441',
+            'ext': 'mp4',
+            'title': 'Baker Mayfield\'s game-changing plays from 3-TD game Week 14',
+            'description': 'md5:85e05a3cc163f8c344340f220521136d',
+            'upload_date': '20201215',
+            'timestamp': 1608009755,
+            'thumbnail': r're:^https?://.*\.jpg',
+            'uploader': 'NFL',
+            'tags': ['Baltimore Ravens at Cleveland Browns (2020-REG-14)', 'Baker Mayfield', 'Game Highlights',
+                     'Player Highlights', 'Cleveland Browns', 'league'],
+            'duration': 157,
+            'categories': ['Entertainment', 'Game', 'Highlights'],
+        },
+    }, {
+        # from https://ktla.com/news/99-year-old-woman-learns-to-fly-in-torrance-checks-off-bucket-list-dream/
+        'url': 'anvato:X8POa4zpGZMmeiq0wqiO8IP5rMqQM9VN:8032455',
+        'md5': '837718bcfb3a7778d022f857f7a9b19e',
+        'info_dict': {
+            'id': '8032455',
+            'ext': 'mp4',
+            'title': '99-year-old woman learns to fly plane in Torrance, checks off bucket list dream',
+            'description': 'md5:0a12bab8159445e78f52a297a35c6609',
+            'upload_date': '20220928',
+            'timestamp': 1664408881,
+            'thumbnail': r're:^https?://.*\.jpg',
+            'uploader': 'LIN',
+            'tags': ['video', 'news', '5live'],
+            'duration': 155,
+            'categories': ['News'],
+        },
+    }]
+
      # Copied from anvplayer.min.js
      _ANVACK_TABLE = {
          'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
@@ -210,89 +238,77 @@ class AnvatoIE(InfoExtractor):
          'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900',
          'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99',
          'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe',
-        'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
+        'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582',
      }
  
-    _TOKEN_GENERATORS = {
-        'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator,
+    def _generate_nfl_token(self, anvack, mcp_id):
+        reroute = self._download_json(
+            'https://api.nfl.com/v1/reroute', mcp_id, data=b'grant_type=client_credentials',
+            headers={'X-Domain-Id': 100}, note='Fetching token info')
+        token_type = reroute.get('token_type') or 'Bearer'
+        auth_token = f'{token_type} {reroute["access_token"]}'
+        response = self._download_json(
+            'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({
+                'query': '''{
+  viewer {
+    mediaToken(anvack: "%s", id: %s) {
+      token
      }
+  }
+}''' % (anvack, mcp_id),  # noqa: UP031
+            }).encode(), headers={
+                'Authorization': auth_token,
+                'Content-Type': 'application/json',
+            }, note='Fetching NFL API token')
+        return traverse_obj(response, ('data', 'viewer', 'mediaToken', 'token'))
  
-    _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA'
-
-    _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
-    _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
-
-    _TESTS = [{
-        # from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874
-        'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496',
-        'info_dict': {
-            'id': '4465496',
-            'ext': 'mp4',
-            'title': 'VIDEO: Humpback whale breaches right next to NH boat',
-            'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.',
-            'duration': 22,
-            'timestamp': 1534855680,
-            'upload_date': '20180821',
-            'uploader': 'ANV',
-        },
-        'params': {
-            'skip_download': True,
-        },
-    }, {
-        # from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/
-        'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601',
-        'only_matching': True,
-    }]
-
-    def __init__(self, *args, **kwargs):
-        super(AnvatoIE, self).__init__(*args, **kwargs)
-        self.__server_time = None
+    _TOKEN_GENERATORS = {
+        'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': _generate_nfl_token,
+    }
  
      def _server_time(self, access_key, video_id):
-        if self.__server_time is not None:
-            return self.__server_time
-
-        self.__server_time = int(self._download_json(
-            self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id,
-            note='Fetching server time')['server_time'])
-
-        return self.__server_time
+        return int_or_none(traverse_obj(self._download_json(
+            f'{self._API_BASE_URL}/server_time', video_id, query={'anvack': access_key},
+            note='Fetching server time', fatal=False), 'server_time')) or int(time.time())
  
-    def _api_prefix(self, access_key):
-        return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage')
-
-    def _get_video_json(self, access_key, video_id):
+    def _get_video_json(self, access_key, video_id, extracted_token):
          # See et() in anvplayer.min.js, which is an alias of getVideoJSON()
-        video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key)
+        video_data_url = f'{self._API_BASE_URL}/mcp/video/{video_id}?anvack={access_key}'
          server_time = self._server_time(access_key, video_id)
-        input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time))
+        input_data = f'{server_time}~{md5_text(video_data_url)}~{md5_text(server_time)}'
  
          auth_secret = intlist_to_bytes(aes_encrypt(
              bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY)))
-
-        video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii')
+        query = {
+            'X-Anvato-Adst-Auth': base64.b64encode(auth_secret).decode('ascii'),
+            'rtyp': 'fp',
+        }
          anvrid = md5_text(time.time() * 1000 * random.random())[:30]
          api = {
              'anvrid': anvrid,
              'anvts': server_time,
          }
-        if self._TOKEN_GENERATORS.get(access_key) is not None:
-            api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id)
+        if extracted_token is not None:
+            api['anvstk2'] = extracted_token
+        elif self._TOKEN_GENERATORS.get(access_key) is not None:
+            api['anvstk2'] = self._TOKEN_GENERATORS[access_key](self, access_key, video_id)
+        elif self._ANVACK_TABLE.get(access_key) is not None:
+            api['anvstk'] = md5_text(f'{access_key}|{anvrid}|{server_time}|{self._ANVACK_TABLE[access_key]}')
          else:
-            api['anvstk'] = md5_text('%s|%s|%d|%s' % (
-                access_key, anvrid, server_time,
-                self._ANVACK_TABLE.get(access_key, self._API_KEY)))
+            api['anvstk2'] = 'default'
  
          return self._download_json(
-            video_data_url, video_id, transform_source=strip_jsonp,
-            data=json.dumps({'api': api}).encode('utf-8'))
+            video_data_url, video_id, transform_source=strip_jsonp, query=query,
+            data=json.dumps({'api': api}, separators=(',', ':')).encode())
  
-    def _get_anvato_videos(self, access_key, video_id):
-        video_data = self._get_video_json(access_key, video_id)
+    def _get_anvato_videos(self, access_key, video_id, token):
+        video_data = self._get_video_json(access_key, video_id, token)
  
          formats = []
          for published_url in video_data['published_urls']:
-            video_url = published_url['embed_url']
+            video_url = published_url.get('embed_url')
+            if not video_url:
+                continue
              media_format = published_url.get('format')
              ext = determine_ext(video_url)
  
@@ -303,19 +319,31 @@ def _get_anvato_videos(self, access_key, video_id):
              tbr = int_or_none(published_url.get('kbps'))
              a_format = {
                  'url': video_url,
-                'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(),
-                'tbr': tbr if tbr != 0 else None,
+                'format_id': join_nonempty('http', published_url.get('cdn_name')).lower(),
+                'tbr': tbr or None,
              }
  
-            if media_format == 'm3u8' and tbr is not None:
+            vtt_subs, hls_subs = {}, {}
+            if media_format == 'vtt':
+                _, vtt_subs = self._extract_m3u8_formats_and_subtitles(
+                    video_url, video_id, m3u8_id='vtt', fatal=False)
+                continue
+            elif media_format == 'm3u8' and tbr is not None:
                  a_format.update({
-                    'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),
+                    'format_id': join_nonempty('hls', tbr),
                      'ext': 'mp4',
                  })
              elif media_format == 'm3u8-variant' or ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                    m3u8_id='hls', fatal=False))
+                # For some videos the initial m3u8 URL returns JSON instead
+                manifest_json = self._download_json(
+                    video_url, video_id, note='Downloading manifest JSON', fatal=False)
+                if manifest_json:
+                    video_url = manifest_json.get('master_m3u8')
+                    if not video_url:
+                        continue
+                hls_fmts, hls_subs = self._extract_m3u8_formats_and_subtitles(
+                    video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)
+                formats.extend(hls_fmts)
                  continue
              elif ext == 'mp3' or media_format == 'mp3':
                  a_format['vcodec'] = 'none'
@@ -326,15 +354,14 @@ def _get_anvato_videos(self, access_key, video_id):
                  })
              formats.append(a_format)
  
-        self._sort_formats(formats)
-
          subtitles = {}
          for caption in video_data.get('captions', []):
              a_caption = {
                  'url': caption['url'],
-                'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None
+                'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None,
              }
              subtitles.setdefault(caption['language'], []).append(a_caption)
+        subtitles = self._merge_subtitles(subtitles, hls_subs, vtt_subs)
  
          return {
              'id': video_id,
@@ -351,38 +378,19 @@ def _get_anvato_videos(self, access_key, video_id):
              'subtitles': subtitles,
          }
  
-    @staticmethod
-    def _extract_urls(ie, webpage, video_id):
-        entries = []
-        for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
-            anvplayer_data = ie._parse_json(
-                mobj.group('anvp'), video_id, transform_source=unescapeHTML,
-                fatal=False)
-            if not anvplayer_data:
-                continue
-            video = anvplayer_data.get('video')
-            if not isinstance(video, compat_str) or not video.isdigit():
-                continue
-            access_key = anvplayer_data.get('accessKey')
-            if not access_key:
-                mcp = anvplayer_data.get('mcp')
-                if mcp:
-                    access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
-                        mcp.lower())
+    @classmethod
+    def _extract_from_webpage(cls, url, webpage):
+        for mobj in re.finditer(cls._ANVP_RE, webpage):
+            anvplayer_data = unescapeHTML(json.loads(mobj.group('anvp'))) or {}
+            video_id, access_key = anvplayer_data.get('video'), anvplayer_data.get('accessKey')
              if not access_key:
+                access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower())
+            if not (video_id or '').isdigit() or not access_key:
                  continue
-            entries.append(ie.url_result(
-                'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
-                video_id=video))
-        return entries
-
-    def _extract_anvato_videos(self, webpage, video_id):
-        anvplayer_data = self._parse_json(
-            self._html_search_regex(
-                self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
-            video_id)
-        return self._get_anvato_videos(
-            anvplayer_data['accessKey'], anvplayer_data['video'])
+            url = f'anvato:{access_key}:{video_id}'
+            if anvplayer_data.get('token'):
+                url = smuggle_url(url, {'token': anvplayer_data['token']})
+            yield cls.url_result(url, AnvatoIE, video_id)
  
      def _real_extract(self, url):
          url, smuggled_data = unsmuggle_url(url, {})
@@ -390,9 +398,7 @@ def _real_extract(self, url):
              'countries': smuggled_data.get('geo_countries'),
          })
  
-        mobj = re.match(self._VALID_URL, url)
-        access_key, video_id = mobj.group('access_key_or_mcp', 'id')
+        access_key, video_id = self._match_valid_url(url).group('access_key_or_mcp', 'id')
          if access_key not in self._ANVACK_TABLE:
-            access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(
-                access_key) or access_key
-        return self._get_anvato_videos(access_key, video_id)
+            access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(access_key) or access_key
+        return self._get_anvato_videos(access_key, video_id, smuggled_data.get('token'))